aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen')
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll9
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir126
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-early-ifcvt.ll80
-rw-r--r--llvm/test/CodeGen/AArch64/avoid-pre-trunc.ll100
-rw-r--r--llvm/test/CodeGen/AArch64/peephole-csel.ll5
-rw-r--r--llvm/test/CodeGen/AArch64/rax1.ll46
-rw-r--r--llvm/test/CodeGen/AArch64/zext-shuffle.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll30
-rw-r--r--llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll171
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll40
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll35
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll30
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll48
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll57
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll786
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll632
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll542
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll330
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll330
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll330
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll330
-rw-r--r--llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir83
-rw-r--r--llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll20
-rw-r--r--llvm/test/CodeGen/BPF/BTF/variant-part.ll87
-rw-r--r--llvm/test/CodeGen/Hexagon/insert-big.ll47
-rw-r--r--llvm/test/CodeGen/Hexagon/qfp-conv.ll35
-rw-r--r--llvm/test/CodeGen/Hexagon/qfp-enabled.ll19
-rw-r--r--llvm/test/CodeGen/Hexagon/qfp-remove-kill.mir95
-rw-r--r--llvm/test/CodeGen/Hexagon/qfp-subreg-bug.mir33
-rw-r--r--llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll21
-rw-r--r--llvm/test/CodeGen/Hexagon/vect/qfp-mix.mir79
-rw-r--r--llvm/test/CodeGen/Hexagon/vect/qfp-zeroinit.mir23
-rw-r--r--llvm/test/CodeGen/Hexagon/vect/unique-vreg-def.ll32
-rw-r--r--llvm/test/CodeGen/PowerPC/all-atomics.ll562
-rw-r--r--llvm/test/CodeGen/PowerPC/atomic-minmax.ll48
-rw-r--r--llvm/test/CodeGen/PowerPC/atomics-regression.ll880
-rw-r--r--llvm/test/CodeGen/PowerPC/atomics.ll12
-rw-r--r--llvm/test/CodeGen/PowerPC/i64_fp_round.ll18
-rw-r--r--llvm/test/CodeGen/PowerPC/ppc-partword-atomic.ll8
-rw-r--r--llvm/test/CodeGen/PowerPC/pr61882.ll4
-rw-r--r--llvm/test/CodeGen/PowerPC/scalar-rounding-ops.ll445
-rw-r--r--llvm/test/CodeGen/PowerPC/sign-ext-atomics.ll6
-rw-r--r--llvm/test/CodeGen/PowerPC/vector-llrint.ll1413
-rw-r--r--llvm/test/CodeGen/PowerPC/vector-lrint.ll1588
-rw-r--r--llvm/test/CodeGen/RISCV/rv32zbs.ll50
-rw-r--r--llvm/test/CodeGen/RISCV/rv64zbs.ll34
-rw-r--r--llvm/test/CodeGen/SystemZ/int-conv-14.ll45
-rw-r--r--llvm/test/CodeGen/SystemZ/int-conv-15.ll45
56 files changed, 5032 insertions, 4837 deletions
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index b215c51..0933e67 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -1371,11 +1371,10 @@ define noundef <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %sc
; CHECK-SD-NEXT: lsr x9, x0, #16
; CHECK-SD-NEXT: adrp x8, .LCPI14_0
; CHECK-SD-NEXT: dup v4.8h, w0
-; CHECK-SD-NEXT: dup v1.8h, w9
-; CHECK-SD-NEXT: fmov s3, w9
-; CHECK-SD-NEXT: sqneg v2.8h, v1.8h
-; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI14_0]
-; CHECK-SD-NEXT: tbl v1.16b, { v2.16b, v3.16b }, v1.16b
+; CHECK-SD-NEXT: ldr q3, [x8, :lo12:.LCPI14_0]
+; CHECK-SD-NEXT: dup v2.8h, w9
+; CHECK-SD-NEXT: sqneg v1.8h, v2.8h
+; CHECK-SD-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v3.16b
; CHECK-SD-NEXT: rev32 v2.8h, v0.8h
; CHECK-SD-NEXT: sqdmull v3.4s, v0.4h, v4.4h
; CHECK-SD-NEXT: sqdmull2 v0.4s, v0.8h, v4.8h
diff --git a/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir b/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir
index 284d624..f34d3ed 100644
--- a/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir
+++ b/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir
@@ -1,16 +1,12 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
-# RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ %s
-# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,-zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
-# RUN: | FileCheck --check-prefix=CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ %s
-# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,+zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
-# RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ %s
-# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
-# RUN: | FileCheck --check-prefix=CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ %s
-# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,+zcz-gpr32,+zcz-gpr64" %s \
-# RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-ZCZ %s
-# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,+zcz-gpr32,+zcz-gpr64" %s \
-# RUN: | FileCheck --check-prefix=CHECK-ZCM-ZCZ %s
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcz-gpr32,-zcz-gpr64" %s \
+# RUN: | FileCheck --check-prefix=CHECK-NOZCZ-GPR32-NOZCZ-GPR64 %s
+# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcz-gpr32,-zcz-gpr64" %s \
+# RUN: | FileCheck --check-prefix=CHECK-ZCZ-GPR32-NOZCZ-GPR64 %s
+# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcz-gpr32,+zcz-gpr64" %s \
+# RUN: | FileCheck --check-prefix=CHECK-NOZCZ-GPR32-ZCZ-GPR64 %s
+# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcz-gpr32,+zcz-gpr64" %s \
+# RUN: | FileCheck --check-prefix=CHECK-ZCZ-GPR32-ZCZ-GPR64 %s
--- |
define void @f0(i64 noundef %x) { ret void }
@@ -24,41 +20,29 @@ liveins:
body: |
bb.0:
liveins: $x0, $lr
- ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-LABEL: name: f0
- ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr
- ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: {{ $}}
- ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: $w0 = ORRWrr $wzr, $wzr
- ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
+ ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-LABEL: name: f0
+ ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64: liveins: $x0, $lr
+ ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-NEXT: {{ $}}
+ ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-NEXT: $w0 = ORRWrr $wzr, $wzr
+ ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
;
- ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-LABEL: name: f0
- ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr
- ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: {{ $}}
- ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: $w0 = ORRWrr $wzr, $wzr
- ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
+ ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-LABEL: name: f0
+ ; CHECK-ZCZ-GPR32-NOZCZ-GPR64: liveins: $x0, $lr
+ ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-NEXT: {{ $}}
+ ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-NEXT: $w0 = MOVZWi 0, 0
+ ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
;
- ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-LABEL: name: f0
- ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr
- ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: {{ $}}
- ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: $x0 = ORRXrr $xzr, undef $xzr, implicit $wzr
- ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
+ ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-LABEL: name: f0
+ ; CHECK-NOZCZ-GPR32-ZCZ-GPR64: liveins: $x0, $lr
+ ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-NEXT: {{ $}}
+ ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-NEXT: $w0 = ORRWrr $wzr, $wzr
+ ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
;
- ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-LABEL: name: f0
- ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr
- ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: {{ $}}
- ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: $w0 = ORRWrr $wzr, $wzr
- ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
- ;
- ; CHECK-NO-ZCM-ZCZ-LABEL: name: f0
- ; CHECK-NO-ZCM-ZCZ: liveins: $x0, $lr
- ; CHECK-NO-ZCM-ZCZ-NEXT: {{ $}}
- ; CHECK-NO-ZCM-ZCZ-NEXT: $w0 = MOVZWi 0, 0
- ; CHECK-NO-ZCM-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
- ;
- ; CHECK-ZCM-ZCZ-LABEL: name: f0
- ; CHECK-ZCM-ZCZ: liveins: $x0, $lr
- ; CHECK-ZCM-ZCZ-NEXT: {{ $}}
- ; CHECK-ZCM-ZCZ-NEXT: $w0 = MOVZWi 0, 0
- ; CHECK-ZCM-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
+ ; CHECK-ZCZ-GPR32-ZCZ-GPR64-LABEL: name: f0
+ ; CHECK-ZCZ-GPR32-ZCZ-GPR64: liveins: $x0, $lr
+ ; CHECK-ZCZ-GPR32-ZCZ-GPR64-NEXT: {{ $}}
+ ; CHECK-ZCZ-GPR32-ZCZ-GPR64-NEXT: $w0 = MOVZWi 0, 0
+ ; CHECK-ZCZ-GPR32-ZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
$w0 = COPY $wzr
BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
...
@@ -69,41 +53,29 @@ liveins:
body: |
bb.0:
liveins: $x0, $lr
- ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-LABEL: name: f1
- ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr
- ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: {{ $}}
- ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: $x0 = ORRXrr $xzr, $xzr
- ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
- ;
- ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-LABEL: name: f1
- ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr
- ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: {{ $}}
- ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: $x0 = ORRXrr $xzr, $xzr
- ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
- ;
- ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-LABEL: name: f1
- ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr
- ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: {{ $}}
- ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: $x0 = ORRXrr $xzr, $xzr
- ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
+ ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-LABEL: name: f1
+ ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64: liveins: $x0, $lr
+ ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-NEXT: {{ $}}
+ ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-NEXT: $x0 = ORRXrr $xzr, $xzr
+ ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
;
- ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-LABEL: name: f1
- ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr
- ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: {{ $}}
- ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: $x0 = ORRXrr $xzr, $xzr
- ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
+ ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-LABEL: name: f1
+ ; CHECK-ZCZ-GPR32-NOZCZ-GPR64: liveins: $x0, $lr
+ ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-NEXT: {{ $}}
+ ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-NEXT: $x0 = ORRXrr $xzr, $xzr
+ ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
;
- ; CHECK-NO-ZCM-ZCZ-LABEL: name: f1
- ; CHECK-NO-ZCM-ZCZ: liveins: $x0, $lr
- ; CHECK-NO-ZCM-ZCZ-NEXT: {{ $}}
- ; CHECK-NO-ZCM-ZCZ-NEXT: $x0 = MOVZXi 0, 0
- ; CHECK-NO-ZCM-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
+ ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-LABEL: name: f1
+ ; CHECK-NOZCZ-GPR32-ZCZ-GPR64: liveins: $x0, $lr
+ ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-NEXT: {{ $}}
+ ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-NEXT: $x0 = MOVZXi 0, 0
+ ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
;
- ; CHECK-ZCM-ZCZ-LABEL: name: f1
- ; CHECK-ZCM-ZCZ: liveins: $x0, $lr
- ; CHECK-ZCM-ZCZ-NEXT: {{ $}}
- ; CHECK-ZCM-ZCZ-NEXT: $x0 = MOVZXi 0, 0
- ; CHECK-ZCM-ZCZ-NEXT:BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
+ ; CHECK-ZCZ-GPR32-ZCZ-GPR64-LABEL: name: f1
+ ; CHECK-ZCZ-GPR32-ZCZ-GPR64: liveins: $x0, $lr
+ ; CHECK-ZCZ-GPR32-ZCZ-GPR64-NEXT: {{ $}}
+ ; CHECK-ZCZ-GPR32-ZCZ-GPR64-NEXT: $x0 = MOVZXi 0, 0
+ ; CHECK-ZCZ-GPR32-ZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
$x0 = COPY $xzr
BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
...
diff --git a/llvm/test/CodeGen/AArch64/arm64-early-ifcvt.ll b/llvm/test/CodeGen/AArch64/arm64-early-ifcvt.ll
index 97a7741..849323f 100644
--- a/llvm/test/CodeGen/AArch64/arm64-early-ifcvt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-early-ifcvt.ll
@@ -421,3 +421,83 @@ for.body51: ; preds = %is_sbox.exit155
unreachable
}
declare fastcc void @get_switch_type(i32, i32, i16 signext, i16 signext, ptr nocapture) nounwind ssp
+
+; CHECK-LABEL: fold_imm1_csinc_32:
+; CHECK: cmp w0, w1
+; CHECK-NEXT: csinc w0, w2, wzr, ge
+; CHECK-NEXT: ret
+define i32 @fold_imm1_csinc_32(i32 %x, i32 %y, i32 %n) nounwind ssp {
+entry:
+ %cmp = icmp slt i32 %x, %y
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ br label %exit
+
+if.else:
+ br label %exit
+
+exit:
+ %result = phi i32 [ 1, %if.then ], [ %n, %if.else ]
+ ret i32 %result
+}
+
+; CHECK-LABEL: fold_imm1_csinc_64:
+; CHECK: cmp x0, x1
+; CHECK-NEXT: csinc x0, x2, xzr, ge
+; CHECK-NEXT: ret
+define i64 @fold_imm1_csinc_64(i64 %x, i64 %y, i64 %n) nounwind ssp {
+entry:
+ %cmp = icmp slt i64 %x, %y
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ br label %exit
+
+if.else:
+ br label %exit
+
+exit:
+ %result = phi i64 [ 1, %if.then ], [ %n, %if.else ]
+ ret i64 %result
+}
+
+; CHECK-LABEL: fold_imm1_cset_32:
+; CHECK: cmp w0, w1
+; CHECK-NEXT: cset w0, lt
+; CHECK-NEXT: ret
+define i32 @fold_imm1_cset_32(i32 %x, i32 %y) nounwind ssp {
+entry:
+ %cmp = icmp slt i32 %x, %y
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ br label %exit
+
+if.else:
+ br label %exit
+
+exit:
+ %result = phi i32 [ 1, %if.then ], [ 0, %if.else ]
+ ret i32 %result
+}
+
+; CHECK-LABEL: fold_imm1_cset_64:
+; CHECK: cmp x0, x1
+; CHECK-NEXT: cset x0, lt
+; CHECK-NEXT: ret
+define i64 @fold_imm1_cset_64(i64 %x, i64 %y) nounwind ssp {
+entry:
+ %cmp = icmp slt i64 %x, %y
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ br label %exit
+
+if.else:
+ br label %exit
+
+exit:
+ %result = phi i64 [ 1, %if.then ], [ 0, %if.else ]
+ ret i64 %result
+}
diff --git a/llvm/test/CodeGen/AArch64/avoid-pre-trunc.ll b/llvm/test/CodeGen/AArch64/avoid-pre-trunc.ll
index c4de177..d7a2a83 100644
--- a/llvm/test/CodeGen/AArch64/avoid-pre-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/avoid-pre-trunc.ll
@@ -5,32 +5,30 @@ define <16 x i8> @lower_trunc_16xi8(i16 %a, i16 %b, i16 %c, i16 %d, i16 %e, i16
; CHECK-LABEL: lower_trunc_16xi8:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s0, w0
-; CHECK-NEXT: ldr h1, [sp]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: mov v0.b[1], w1
+; CHECK-NEXT: mov v0.b[2], w2
+; CHECK-NEXT: mov v0.b[3], w3
+; CHECK-NEXT: mov v0.b[4], w4
+; CHECK-NEXT: mov v0.b[5], w5
+; CHECK-NEXT: mov v0.b[6], w6
+; CHECK-NEXT: mov v0.b[7], w7
+; CHECK-NEXT: ld1 { v0.b }[8], [x8]
; CHECK-NEXT: add x8, sp, #8
-; CHECK-NEXT: ld1 { v1.h }[1], [x8]
+; CHECK-NEXT: ld1 { v0.b }[9], [x8]
; CHECK-NEXT: add x8, sp, #16
-; CHECK-NEXT: mov v0.h[1], w1
-; CHECK-NEXT: ld1 { v1.h }[2], [x8]
+; CHECK-NEXT: ld1 { v0.b }[10], [x8]
; CHECK-NEXT: add x8, sp, #24
-; CHECK-NEXT: mov v0.h[2], w2
-; CHECK-NEXT: ld1 { v1.h }[3], [x8]
+; CHECK-NEXT: ld1 { v0.b }[11], [x8]
; CHECK-NEXT: add x8, sp, #32
-; CHECK-NEXT: mov v0.h[3], w3
-; CHECK-NEXT: ld1 { v1.h }[4], [x8]
+; CHECK-NEXT: ld1 { v0.b }[12], [x8]
; CHECK-NEXT: add x8, sp, #40
-; CHECK-NEXT: ld1 { v1.h }[5], [x8]
+; CHECK-NEXT: ld1 { v0.b }[13], [x8]
; CHECK-NEXT: add x8, sp, #48
-; CHECK-NEXT: mov v0.h[4], w4
-; CHECK-NEXT: ld1 { v1.h }[6], [x8]
+; CHECK-NEXT: ld1 { v0.b }[14], [x8]
; CHECK-NEXT: add x8, sp, #56
-; CHECK-NEXT: mov v0.h[5], w5
-; CHECK-NEXT: ld1 { v1.h }[7], [x8]
-; CHECK-NEXT: mov v0.h[6], w6
-; CHECK-NEXT: add v2.8h, v1.8h, v1.8h
-; CHECK-NEXT: mov v0.h[7], w7
-; CHECK-NEXT: add v3.8h, v0.8h, v0.8h
-; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: uzp1 v1.16b, v3.16b, v2.16b
+; CHECK-NEXT: ld1 { v0.b }[15], [x8]
+; CHECK-NEXT: add v1.16b, v0.16b, v0.16b
; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%a1 = insertelement <16 x i16> poison, i16 %a, i16 0
@@ -59,18 +57,15 @@ define <16 x i8> @lower_trunc_16xi8(i16 %a, i16 %b, i16 %c, i16 %d, i16 %e, i16
define <8 x i16> @lower_trunc_8xi16(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h) {
; CHECK-LABEL: lower_trunc_8xi16:
; CHECK: // %bb.0:
-; CHECK-NEXT: fmov s0, w4
-; CHECK-NEXT: fmov s1, w0
-; CHECK-NEXT: mov v0.s[1], w5
-; CHECK-NEXT: mov v1.s[1], w1
-; CHECK-NEXT: mov v0.s[2], w6
-; CHECK-NEXT: mov v1.s[2], w2
-; CHECK-NEXT: mov v0.s[3], w7
-; CHECK-NEXT: mov v1.s[3], w3
-; CHECK-NEXT: add v2.4s, v0.4s, v0.4s
-; CHECK-NEXT: add v3.4s, v1.4s, v1.4s
-; CHECK-NEXT: uzp1 v0.8h, v1.8h, v0.8h
-; CHECK-NEXT: uzp1 v1.8h, v3.8h, v2.8h
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: mov v0.h[1], w1
+; CHECK-NEXT: mov v0.h[2], w2
+; CHECK-NEXT: mov v0.h[3], w3
+; CHECK-NEXT: mov v0.h[4], w4
+; CHECK-NEXT: mov v0.h[5], w5
+; CHECK-NEXT: mov v0.h[6], w6
+; CHECK-NEXT: mov v0.h[7], w7
+; CHECK-NEXT: add v1.8h, v0.8h, v0.8h
; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%a1 = insertelement <8 x i32> poison, i32 %a, i32 0
@@ -91,14 +86,11 @@ define <8 x i16> @lower_trunc_8xi16(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32
define <4 x i32> @lower_trunc_4xi32(i64 %a, i64 %b, i64 %c, i64 %d) {
; CHECK-LABEL: lower_trunc_4xi32:
; CHECK: // %bb.0:
-; CHECK-NEXT: fmov d0, x2
-; CHECK-NEXT: fmov d1, x0
-; CHECK-NEXT: mov v0.d[1], x3
-; CHECK-NEXT: mov v1.d[1], x1
-; CHECK-NEXT: add v2.2d, v0.2d, v0.2d
-; CHECK-NEXT: add v3.2d, v1.2d, v1.2d
-; CHECK-NEXT: uzp1 v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: uzp1 v1.4s, v3.4s, v2.4s
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: mov v0.s[1], w1
+; CHECK-NEXT: mov v0.s[2], w2
+; CHECK-NEXT: mov v0.s[3], w3
+; CHECK-NEXT: add v1.4s, v0.4s, v0.4s
; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%a1 = insertelement <4 x i64> poison, i64 %a, i64 0
@@ -115,24 +107,20 @@ define <4 x i32> @lower_trunc_4xi32(i64 %a, i64 %b, i64 %c, i64 %d) {
define <8 x i32> @lower_trunc_8xi32(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i64 %h) {
; CHECK-LABEL: lower_trunc_8xi32:
; CHECK: // %bb.0:
-; CHECK-NEXT: fmov d0, x2
-; CHECK-NEXT: fmov d1, x0
-; CHECK-NEXT: fmov d2, x6
-; CHECK-NEXT: fmov d3, x4
-; CHECK-NEXT: mov v0.d[1], x3
-; CHECK-NEXT: mov v1.d[1], x1
-; CHECK-NEXT: mov v2.d[1], x7
-; CHECK-NEXT: mov v3.d[1], x5
-; CHECK-NEXT: add v4.2d, v0.2d, v0.2d
-; CHECK-NEXT: add v5.2d, v1.2d, v1.2d
-; CHECK-NEXT: add v6.2d, v2.2d, v2.2d
-; CHECK-NEXT: add v7.2d, v3.2d, v3.2d
+; CHECK-NEXT: fmov d0, x6
+; CHECK-NEXT: fmov d1, x4
+; CHECK-NEXT: fmov d2, x2
+; CHECK-NEXT: fmov d3, x0
+; CHECK-NEXT: mov v0.d[1], x7
+; CHECK-NEXT: mov v1.d[1], x5
+; CHECK-NEXT: mov v2.d[1], x3
+; CHECK-NEXT: mov v3.d[1], x1
+; CHECK-NEXT: uzp1 v1.4s, v1.4s, v0.4s
; CHECK-NEXT: uzp1 v2.4s, v3.4s, v2.4s
-; CHECK-NEXT: uzp1 v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: uzp1 v3.4s, v5.4s, v4.4s
-; CHECK-NEXT: uzp1 v1.4s, v7.4s, v6.4s
-; CHECK-NEXT: eor v0.16b, v0.16b, v3.16b
-; CHECK-NEXT: eor v1.16b, v2.16b, v1.16b
+; CHECK-NEXT: add v3.4s, v1.4s, v1.4s
+; CHECK-NEXT: add v0.4s, v2.4s, v2.4s
+; CHECK-NEXT: eor v1.16b, v1.16b, v3.16b
+; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b
; CHECK-NEXT: ret
%a1 = insertelement <8 x i64> poison, i64 %a, i64 0
%b1 = insertelement <8 x i64> %a1, i64 %b, i64 1
diff --git a/llvm/test/CodeGen/AArch64/peephole-csel.ll b/llvm/test/CodeGen/AArch64/peephole-csel.ll
index 868b9f1..b0852580 100644
--- a/llvm/test/CodeGen/AArch64/peephole-csel.ll
+++ b/llvm/test/CodeGen/AArch64/peephole-csel.ll
@@ -5,10 +5,9 @@ define void @peephole_csel(ptr %dst, i1 %0, i1 %cmp) {
; CHECK-LABEL: peephole_csel:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: tst w2, #0x1
-; CHECK-NEXT: mov w8, #1 // =0x1
-; CHECK-NEXT: mov x9, xzr
+; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: tst w1, #0x1
-; CHECK-NEXT: csel x8, x8, x9, eq
+; CHECK-NEXT: csinc x8, x8, xzr, ne
; CHECK-NEXT: str x8, [x0]
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/rax1.ll b/llvm/test/CodeGen/AArch64/rax1.ll
index f679e90..4bb551f 100644
--- a/llvm/test/CodeGen/AArch64/rax1.ll
+++ b/llvm/test/CodeGen/AArch64/rax1.ll
@@ -1,22 +1,44 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefix=SHA3 %s
-; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefix=NOSHA3 %s
+; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefixes=SHA3,SHA3-SD %s
+; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefixes=NOSHA3,NOSHA3-SD %s
+; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s -global-isel | FileCheck --check-prefixes=SHA3,SHA3-GI %s
+; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s -global-isel | FileCheck --check-prefixes=NOSHA3,NOSHA3-GI %s
define <2 x i64> @rax1(<2 x i64> %x, <2 x i64> %y) {
-; SHA3-LABEL: rax1:
-; SHA3: // %bb.0:
-; SHA3-NEXT: rax1 v0.2d, v0.2d, v1.2d
-; SHA3-NEXT: ret
+; SHA3-SD-LABEL: rax1:
+; SHA3-SD: // %bb.0:
+; SHA3-SD-NEXT: rax1 v0.2d, v0.2d, v1.2d
+; SHA3-SD-NEXT: ret
;
-; NOSHA3-LABEL: rax1:
-; NOSHA3: // %bb.0:
-; NOSHA3-NEXT: add v2.2d, v1.2d, v1.2d
-; NOSHA3-NEXT: usra v2.2d, v1.2d, #63
-; NOSHA3-NEXT: eor v0.16b, v0.16b, v2.16b
-; NOSHA3-NEXT: ret
+; NOSHA3-SD-LABEL: rax1:
+; NOSHA3-SD: // %bb.0:
+; NOSHA3-SD-NEXT: add v2.2d, v1.2d, v1.2d
+; NOSHA3-SD-NEXT: usra v2.2d, v1.2d, #63
+; NOSHA3-SD-NEXT: eor v0.16b, v0.16b, v2.16b
+; NOSHA3-SD-NEXT: ret
+;
+; SHA3-GI-LABEL: rax1:
+; SHA3-GI: // %bb.0:
+; SHA3-GI-NEXT: shl v2.2d, v1.2d, #1
+; SHA3-GI-NEXT: ushr v1.2d, v1.2d, #63
+; SHA3-GI-NEXT: orr v1.16b, v2.16b, v1.16b
+; SHA3-GI-NEXT: eor v0.16b, v0.16b, v1.16b
+; SHA3-GI-NEXT: ret
+;
+; NOSHA3-GI-LABEL: rax1:
+; NOSHA3-GI: // %bb.0:
+; NOSHA3-GI-NEXT: shl v2.2d, v1.2d, #1
+; NOSHA3-GI-NEXT: ushr v1.2d, v1.2d, #63
+; NOSHA3-GI-NEXT: orr v1.16b, v2.16b, v1.16b
+; NOSHA3-GI-NEXT: eor v0.16b, v0.16b, v1.16b
+; NOSHA3-GI-NEXT: ret
%a = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %y, <2 x i64> %y, <2 x i64> <i64 1, i64 1>)
%b = xor <2 x i64> %x, %a
ret <2 x i64> %b
}
declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; NOSHA3: {{.*}}
+; SHA3: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/zext-shuffle.ll b/llvm/test/CodeGen/AArch64/zext-shuffle.ll
index 20d2071..a0d4e18 100644
--- a/llvm/test/CodeGen/AArch64/zext-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/zext-shuffle.ll
@@ -674,10 +674,8 @@ define <4 x i32> @isUndefDeInterleave_t1_bad(<8 x i16> %a) {
define i16 @undeftop(<8 x i16> %0) {
; CHECK-LABEL: undeftop:
; CHECK: // %bb.0:
-; CHECK-NEXT: dup v0.8h, v0.h[4]
-; CHECK-NEXT: uaddl v0.4s, v0.4h, v0.4h
-; CHECK-NEXT: xtn v0.4h, v0.4s
-; CHECK-NEXT: umov w0, v0.h[0]
+; CHECK-NEXT: add v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: umov w0, v0.h[4]
; CHECK-NEXT: ret
%2 = shufflevector <8 x i16> %0, <8 x i16> zeroinitializer, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 9, i32 7, i32 5, i32 3>
%3 = zext <8 x i16> %2 to <8 x i64>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
index 002c03aa..e86f747 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
@@ -551,7 +551,9 @@ define amdgpu_kernel void @workgroup_one_as_release() #0 {
;
; GFX10CU-LABEL: name: workgroup_one_as_release
; GFX10CU: bb.0.entry:
+ ; GFX10CU-NEXT: S_WAITCNT_soft 16240
; GFX10CU-NEXT: S_WAITCNT_lds_direct
+ ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_one_as_release
@@ -562,6 +564,8 @@ define amdgpu_kernel void @workgroup_one_as_release() #0 {
;
; GFX11CU-LABEL: name: workgroup_one_as_release
; GFX11CU: bb.0.entry:
+ ; GFX11CU-NEXT: S_WAITCNT_soft 1015
+ ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("workgroup-one-as") release
@@ -587,7 +591,9 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 {
;
; GFX10CU-LABEL: name: workgroup_one_as_acq_rel
; GFX10CU: bb.0.entry:
+ ; GFX10CU-NEXT: S_WAITCNT_soft 16240
; GFX10CU-NEXT: S_WAITCNT_lds_direct
+ ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_one_as_acq_rel
@@ -599,6 +605,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 {
;
; GFX11CU-LABEL: name: workgroup_one_as_acq_rel
; GFX11CU: bb.0.entry:
+ ; GFX11CU-NEXT: S_WAITCNT_soft 1015
+ ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("workgroup-one-as") acq_rel
@@ -624,7 +632,9 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 {
;
; GFX10CU-LABEL: name: workgroup_one_as_seq_cst
; GFX10CU: bb.0.entry:
+ ; GFX10CU-NEXT: S_WAITCNT_soft 16240
; GFX10CU-NEXT: S_WAITCNT_lds_direct
+ ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_one_as_seq_cst
@@ -636,6 +646,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 {
;
; GFX11CU-LABEL: name: workgroup_one_as_seq_cst
; GFX11CU: bb.0.entry:
+ ; GFX11CU-NEXT: S_WAITCNT_soft 1015
+ ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("workgroup-one-as") seq_cst
@@ -1305,8 +1317,9 @@ define amdgpu_kernel void @workgroup_release() #0 {
;
; GFX10CU-LABEL: name: workgroup_release
; GFX10CU: bb.0.entry:
- ; GFX10CU-NEXT: S_WAITCNT_soft 49279
+ ; GFX10CU-NEXT: S_WAITCNT_soft 112
; GFX10CU-NEXT: S_WAITCNT_lds_direct
+ ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_release
@@ -1317,7 +1330,8 @@ define amdgpu_kernel void @workgroup_release() #0 {
;
; GFX11CU-LABEL: name: workgroup_release
; GFX11CU: bb.0.entry:
- ; GFX11CU-NEXT: S_WAITCNT_soft 64519
+ ; GFX11CU-NEXT: S_WAITCNT_soft 7
+ ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("workgroup") release
@@ -1345,8 +1359,9 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 {
;
; GFX10CU-LABEL: name: workgroup_acq_rel
; GFX10CU: bb.0.entry:
- ; GFX10CU-NEXT: S_WAITCNT_soft 49279
+ ; GFX10CU-NEXT: S_WAITCNT_soft 112
; GFX10CU-NEXT: S_WAITCNT_lds_direct
+ ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_acq_rel
@@ -1358,7 +1373,8 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 {
;
; GFX11CU-LABEL: name: workgroup_acq_rel
; GFX11CU: bb.0.entry:
- ; GFX11CU-NEXT: S_WAITCNT_soft 64519
+ ; GFX11CU-NEXT: S_WAITCNT_soft 7
+ ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("workgroup") acq_rel
@@ -1386,8 +1402,9 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 {
;
; GFX10CU-LABEL: name: workgroup_seq_cst
; GFX10CU: bb.0.entry:
- ; GFX10CU-NEXT: S_WAITCNT_soft 49279
+ ; GFX10CU-NEXT: S_WAITCNT_soft 112
; GFX10CU-NEXT: S_WAITCNT_lds_direct
+ ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_seq_cst
@@ -1399,7 +1416,8 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 {
;
; GFX11CU-LABEL: name: workgroup_seq_cst
; GFX11CU: bb.0.entry:
- ; GFX11CU-NEXT: S_WAITCNT_soft 64519
+ ; GFX11CU-NEXT: S_WAITCNT_soft 7
+ ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("workgroup") seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
index 53b2542..142290a 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
@@ -3611,10 +3611,10 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0 glc
+; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: buffer_load_ushort v6, off, s[16:19], 0 offset:4 glc
-; SDAG-NEXT: s_waitcnt vmcnt(1)
-; SDAG-NEXT: v_lshrrev_b32_e32 v7, 8, v0
; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshrrev_b32_e32 v7, 8, v0
; SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v6
; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -3627,12 +3627,12 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0 glc
+; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:4 glc
-; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v4
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -3652,6 +3652,7 @@ define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: buffer_store_short v4, off, s[16:19], 0 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -3671,6 +3672,7 @@ define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GISEL-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: buffer_store_short v2, off, s[16:19], 0 offset:4
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
index b91963f..d23509b 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
@@ -150,7 +150,6 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
; GFX10CU-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
; GFX10CU-NEXT: v_mov_b32_e32 v0, s13
; GFX10CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX10CU-NEXT: s_barrier
; GFX10CU-NEXT: ds_read_b32 v0, v0
; GFX10CU-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
index 5d03dfb..352af04 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
@@ -218,3 +218,174 @@ main_body:
ret void
}
+define amdgpu_ps void @global_load_lds_dword_volatile(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: global_load_lds_dword_volatile:
+; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_mov_b32 m0, s0
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_load_dword v[0:1], off glc lds
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_load_dword v[0:1], off offset:256 lds
+; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: global_load_lds_dword_volatile:
+; GFX942: ; %bb.0: ; %main_body
+; GFX942-NEXT: s_mov_b32 m0, s0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:256
+; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512
+; GFX942-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_load_lds_dword_volatile:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: s_mov_b32 m0, s0
+; GFX10-NEXT: global_load_dword v[0:1], off glc dlc lds
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_load_dword v[0:1], off offset:256 lds
+; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds
+; GFX10-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: global_load_lds_dword_volatile:
+; GFX942-GISEL: ; %bb.0: ; %main_body
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s0
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:256
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512
+; GFX942-GISEL-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648)
+ call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 256, i32 0)
+ call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_volatile(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: buffer_load_lds_dword_volatile:
+; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX90A-NEXT: s_mov_b32 m0, s5
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: buffer_load_lds_dword_volatile:
+; GFX942: ; %bb.0: ; %main_body
+; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX942-NEXT: s_mov_b32 m0, s5
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-NEXT: s_endpgm
+;
+; GFX10-LABEL: buffer_load_lds_dword_volatile:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0
+; GFX10-NEXT: s_mov_b32 m0, s5
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc dlc lds
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX10-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: buffer_load_lds_dword_volatile:
+; GFX942-GISEL: ; %bb.0: ; %main_body
+; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-GISEL-NEXT: s_endpgm
+main_body:
+ %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
+ call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648)
+ call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 256, i32 0)
+ call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_load_lds_dword_nontemporal(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: global_load_lds_dword_nontemporal:
+; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_mov_b32 m0, s2
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_load_dword v0, s[0:1] glc slc lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: global_load_lds_dword_nontemporal:
+; GFX942: ; %bb.0: ; %main_body
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_mov_b32 m0, s2
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_load_lds_dword v0, s[0:1] nt
+; GFX942-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_load_lds_dword_nontemporal:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_mov_b32 m0, s2
+; GFX10-NEXT: global_load_dword v0, s[0:1] slc lds
+; GFX10-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: global_load_lds_dword_nontemporal:
+; GFX942-GISEL: ; %bb.0: ; %main_body
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: global_load_lds_dword v0, s[0:1] nt
+; GFX942-GISEL-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0
+ ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_nontemporal(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: buffer_load_lds_dword_nontemporal:
+; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX90A-NEXT: s_mov_b32 m0, s5
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc slc lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: buffer_load_lds_dword_nontemporal:
+; GFX942: ; %bb.0: ; %main_body
+; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX942-NEXT: s_mov_b32 m0, s5
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds
+; GFX942-NEXT: s_endpgm
+;
+; GFX10-LABEL: buffer_load_lds_dword_nontemporal:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0
+; GFX10-NEXT: s_mov_b32 m0, s5
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen slc lds
+; GFX10-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: buffer_load_lds_dword_nontemporal:
+; GFX942-GISEL: ; %bb.0: ; %main_body
+; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds
+; GFX942-GISEL-NEXT: s_endpgm
+main_body:
+ %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
+ call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0
+ ret void
+}
+
+!0 = !{i32 1}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll
index f0204bd..1dcd032 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll
@@ -110,3 +110,43 @@ main_body:
call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 1, i32 0, i32 0, i32 2048, i32 0)
ret void
}
+
+define amdgpu_ps float @buffer_load_lds_dword_volatile(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
+; GCN-LABEL: buffer_load_lds_dword_volatile:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 m0, s4
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_load_dword off, s[0:3], 0 glc lds
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword off, s[0:3], 0 offset:256 lds
+; GCN-NEXT: buffer_load_dword off, s[0:3], 0 offset:512 lds
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ds_read_b32 v0, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
+main_body:
+ call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 2147483648)
+ call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 256, i32 0)
+ call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 512, i32 0)
+ %res = load float, ptr addrspace(3) %lds
+ ret float %res
+}
+
+define amdgpu_ps float @buffer_load_lds_dword_nontemporal(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
+; GCN-LABEL: buffer_load_lds_dword_nontemporal:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 m0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: buffer_load_dword off, s[0:3], 0 glc slc lds
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ds_read_b32 v0, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
+main_body:
+ call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0), !nontemporal !0
+ %res = load float, ptr addrspace(3) %lds
+ ret float %res
+}
+
+!0 = !{i32 1}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
index b5d741b..a979999 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
@@ -80,25 +80,29 @@ define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load_volatile(p
; PREGFX10-LABEL: buffer_load_volatile:
; PREGFX10: ; %bb.0: ; %main_body
; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
+; PREGFX10-NEXT: s_waitcnt vmcnt(0)
; PREGFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc
+; PREGFX10-NEXT: s_waitcnt vmcnt(0)
; PREGFX10-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 glc slc
; PREGFX10-NEXT: s_waitcnt vmcnt(0)
; PREGFX10-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: buffer_load_volatile:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 glc slc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: buffer_load_volatile:
; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[0:3], 0 glc slc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
index e8b8d05..e8eccb0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
@@ -147,14 +147,13 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
; GFX8-OPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-OPT-NEXT: v_mov_b32_e32 v2, 0
; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-OPT-NEXT: s_barrier
-; GFX8-OPT-NEXT: v_add_u32_e32 v1, vcc, v1, v1
-; GFX8-OPT-NEXT: s_nop 1
-; GFX8-OPT-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
-; GFX8-OPT-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-OPT-NEXT: v_add_u32_e32 v4, vcc, v1, v1
+; GFX8-OPT-NEXT: v_mov_b32_e32 v3, s1
; GFX8-OPT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-OPT-NEXT: v_mov_b32_dpp v2, v4 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
+; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-OPT-NEXT: v_add_u32_e32 v2, vcc, v2, v4
+; GFX8-OPT-NEXT: s_barrier
; GFX8-OPT-NEXT: flat_store_dword v[0:1], v2
; GFX8-OPT-NEXT: s_endpgm
;
@@ -194,14 +193,14 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: ds_read_b32 v1, v0
-; GFX10-NEXT: s_barrier
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0
-; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v1
-; GFX10-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
-; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v3, v1, v1
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
+; GFX10-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
+; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX10-NEXT: s_barrier
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
@@ -213,15 +212,15 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX11-NEXT: ds_load_b32 v1, v0
-; GFX11-NEXT: s_barrier
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v3, v1, v1
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
+; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX11-NEXT: s_barrier
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
index 97db15b..1d1d3e4 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
@@ -107,6 +107,7 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
@@ -120,7 +121,6 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i
; GFX12-NEXT: s_mov_b32 s3, s12
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen
; GFX12-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
index 9dac239..1e4b633 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
@@ -354,6 +354,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX9-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11]
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX9-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30
; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
; GFX9-SDAG-NEXT: s_mov_b32 s5, s10
@@ -365,8 +366,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX9-SDAG-NEXT: s_mov_b32 s3, s10
; GFX9-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11]
; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -384,6 +385,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30
; GFX9-GISEL-NEXT: s_mov_b32 s4, s11
@@ -395,8 +397,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX9-GISEL-NEXT: s_mov_b32 s10, s3
; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX942-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -414,6 +416,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX942-SDAG-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13]
; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX942-SDAG-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX942-SDAG-NEXT: s_load_dword s13, s[4:5], 0x30
; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
; GFX942-SDAG-NEXT: s_mov_b32 s5, s12
@@ -425,8 +428,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX942-SDAG-NEXT: s_mov_b32 s3, s12
; GFX942-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX942-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX942-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -444,6 +447,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX942-GISEL-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX942-GISEL-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
; GFX942-GISEL-NEXT: s_load_dword s9, s[4:5], 0x30
; GFX942-GISEL-NEXT: s_mov_b32 s4, s7
@@ -455,8 +459,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX942-GISEL-NEXT: s_mov_b32 s6, s3
; GFX942-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -475,6 +479,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX10-SDAG-NEXT: s_mov_b32 s11, s2
; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11]
; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: s_clause 0x1
; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30
; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
@@ -488,8 +493,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX10-SDAG-NEXT: s_mov_b32 s2, s1
; GFX10-SDAG-NEXT: s_mov_b32 s3, s10
; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11]
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
+; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -508,6 +513,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX10-GISEL-NEXT: s_mov_b32 s6, s3
; GFX10-GISEL-NEXT: s_or_b64 s[2:3], s[6:7], s[4:5]
; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc dlc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
@@ -519,8 +525,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX10-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; GFX10-GISEL-NEXT: s_mov_b32 s6, s3
; GFX10-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11]
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
+; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -541,6 +547,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
@@ -554,8 +561,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX11-SDAG-NEXT: s_mov_b32 s3, s12
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -576,6 +583,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
@@ -588,8 +596,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX11-GISEL-NEXT: s_mov_b32 s8, s3
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -610,6 +618,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
@@ -623,8 +632,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX12-SDAG-NEXT: s_mov_b32 s3, s12
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -645,6 +654,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
@@ -657,8 +667,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX12-GISEL-NEXT: s_mov_b32 s8, s3
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_endpgm
entry:
%val = load volatile i32, ptr addrspace(7) %in, !nontemporal !0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll
index 516c3946..282a7ae 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll
@@ -15,7 +15,6 @@ define amdgpu_kernel void @test_s_barrier() {
;
; GFX10-CU-LABEL: test_s_barrier:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-CU-NEXT: s_barrier
; GFX10-CU-NEXT: s_endpgm
;
@@ -26,7 +25,6 @@ define amdgpu_kernel void @test_s_barrier() {
;
; GFX11-CU-LABEL: test_s_barrier:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX11-CU-NEXT: s_barrier
; GFX11-CU-NEXT: s_endpgm
;
@@ -38,7 +36,6 @@ define amdgpu_kernel void @test_s_barrier() {
;
; GFX12-CU-LABEL: test_s_barrier:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: s_wait_alu 0xffe3
; GFX12-CU-NEXT: s_barrier_signal -1
; GFX12-CU-NEXT: s_barrier_wait -1
; GFX12-CU-NEXT: s_endpgm
@@ -63,8 +60,8 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() {
;
; GFX10-CU-LABEL: test_s_barrier_workgroup_fence:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_barrier
; GFX10-CU-NEXT: s_endpgm
;
@@ -77,8 +74,8 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() {
;
; GFX11-CU-LABEL: test_s_barrier_workgroup_fence:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_barrier
; GFX11-CU-NEXT: s_endpgm
;
@@ -94,8 +91,10 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() {
;
; GFX12-CU-LABEL: test_s_barrier_workgroup_fence:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
-; GFX12-CU-NEXT: s_wait_alu 0xffe3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_barrier_signal -1
; GFX12-CU-NEXT: s_barrier_wait -1
; GFX12-CU-NEXT: s_endpgm
@@ -125,7 +124,6 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() {
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-CU-NEXT: s_barrier
; GFX10-CU-NEXT: s_endpgm
;
@@ -140,7 +138,6 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() {
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX11-CU-NEXT: s_barrier
; GFX11-CU-NEXT: s_endpgm
;
@@ -160,7 +157,6 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() {
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-CU-NEXT: s_wait_alu 0xffe3
; GFX12-CU-NEXT: s_barrier_signal -1
; GFX12-CU-NEXT: s_barrier_wait -1
; GFX12-CU-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
index 6a76f43..7efbff9 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
@@ -107,6 +107,8 @@ define amdgpu_kernel void @workgroup_release_fence() {
;
; GFX10-CU-LABEL: workgroup_release_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_release_fence:
@@ -139,6 +141,8 @@ define amdgpu_kernel void @workgroup_release_fence() {
;
; GFX11-CU-LABEL: workgroup_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_release_fence:
@@ -151,6 +155,10 @@ define amdgpu_kernel void @workgroup_release_fence() {
;
; GFX12-CU-LABEL: workgroup_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_release_fence:
@@ -181,6 +189,8 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
;
; GFX10-CU-LABEL: workgroup_acq_rel_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_acq_rel_fence:
@@ -216,6 +226,8 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
;
; GFX11-CU-LABEL: workgroup_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_acq_rel_fence:
@@ -229,6 +241,10 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
;
; GFX12-CU-LABEL: workgroup_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_acq_rel_fence:
@@ -259,6 +275,8 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
;
; GFX10-CU-LABEL: workgroup_seq_cst_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_seq_cst_fence:
@@ -294,6 +312,8 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
;
; GFX11-CU-LABEL: workgroup_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_seq_cst_fence:
@@ -307,6 +327,10 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
;
; GFX12-CU-LABEL: workgroup_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_seq_cst_fence:
@@ -412,6 +436,8 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
;
; GFX10-CU-LABEL: workgroup_one_as_release_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_one_as_release_fence:
@@ -444,6 +470,8 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
;
; GFX11-CU-LABEL: workgroup_one_as_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_one_as_release_fence:
@@ -456,6 +484,10 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
;
; GFX12-CU-LABEL: workgroup_one_as_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_one_as_release_fence:
@@ -486,6 +518,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
;
; GFX10-CU-LABEL: workgroup_one_as_acq_rel_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_one_as_acq_rel_fence:
@@ -521,6 +555,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
;
; GFX11-CU-LABEL: workgroup_one_as_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_one_as_acq_rel_fence:
@@ -534,6 +570,10 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
;
; GFX12-CU-LABEL: workgroup_one_as_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_one_as_acq_rel_fence:
@@ -564,6 +604,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
;
; GFX10-CU-LABEL: workgroup_one_as_seq_cst_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_one_as_seq_cst_fence:
@@ -599,6 +641,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
;
; GFX11-CU-LABEL: workgroup_one_as_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_one_as_seq_cst_fence:
@@ -612,6 +656,10 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
;
; GFX12-CU-LABEL: workgroup_one_as_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_one_as_seq_cst_fence:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
index d288bfc..1cca64a 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
@@ -1093,7 +1093,8 @@ define amdgpu_kernel void @workgroup_release_fence() {
;
; GFX10-CU-LABEL: workgroup_release_fence:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_release_fence:
@@ -1129,7 +1130,8 @@ define amdgpu_kernel void @workgroup_release_fence() {
;
; GFX11-CU-LABEL: workgroup_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_release_fence:
@@ -1142,7 +1144,10 @@ define amdgpu_kernel void @workgroup_release_fence() {
;
; GFX12-CU-LABEL: workgroup_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_release_fence:
@@ -1175,7 +1180,8 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
;
; GFX10-CU-LABEL: workgroup_acq_rel_fence:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_acq_rel_fence:
@@ -1214,7 +1220,8 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
;
; GFX11-CU-LABEL: workgroup_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_acq_rel_fence:
@@ -1228,7 +1235,10 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
;
; GFX12-CU-LABEL: workgroup_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_acq_rel_fence:
@@ -1261,7 +1271,8 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
;
; GFX10-CU-LABEL: workgroup_seq_cst_fence:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_seq_cst_fence:
@@ -1300,7 +1311,8 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
;
; GFX11-CU-LABEL: workgroup_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_seq_cst_fence:
@@ -1314,7 +1326,10 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
;
; GFX12-CU-LABEL: workgroup_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_seq_cst_fence:
@@ -1420,6 +1435,8 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
;
; GFX10-CU-LABEL: workgroup_one_as_release_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_one_as_release_fence:
@@ -1452,6 +1469,8 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
;
; GFX11-CU-LABEL: workgroup_one_as_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_one_as_release_fence:
@@ -1464,6 +1483,10 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
;
; GFX12-CU-LABEL: workgroup_one_as_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_one_as_release_fence:
@@ -1494,6 +1517,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
;
; GFX10-CU-LABEL: workgroup_one_as_acq_rel_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_one_as_acq_rel_fence:
@@ -1529,6 +1554,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
;
; GFX11-CU-LABEL: workgroup_one_as_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_one_as_acq_rel_fence:
@@ -1542,6 +1569,10 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
;
; GFX12-CU-LABEL: workgroup_one_as_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_one_as_acq_rel_fence:
@@ -1572,6 +1603,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
;
; GFX10-CU-LABEL: workgroup_one_as_seq_cst_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_one_as_seq_cst_fence:
@@ -1607,6 +1640,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
;
; GFX11-CU-LABEL: workgroup_one_as_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_one_as_seq_cst_fence:
@@ -1620,6 +1655,10 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
;
; GFX12-CU-LABEL: workgroup_one_as_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_one_as_seq_cst_fence:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
index d277441..2afa577 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
@@ -1072,7 +1072,8 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -1109,7 +1110,8 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -1136,7 +1138,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
index 3826953..d384aec 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
@@ -656,12 +656,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -765,12 +765,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -800,12 +800,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -1193,7 +1195,8 @@ define amdgpu_kernel void @flat_workgroup_release_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -1278,7 +1281,8 @@ define amdgpu_kernel void @flat_workgroup_release_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -1305,7 +1309,10 @@ define amdgpu_kernel void @flat_workgroup_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -1372,7 +1379,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -1457,7 +1465,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -1484,7 +1493,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -1891,7 +1903,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -1976,7 +1989,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -2003,7 +2017,10 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -2074,9 +2091,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_atomicrmw:
@@ -2170,9 +2189,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw:
@@ -2200,9 +2221,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_acq_rel_atomicrmw:
@@ -2273,9 +2297,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_atomicrmw:
@@ -2369,9 +2395,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw:
@@ -2399,9 +2427,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_seq_cst_atomicrmw:
@@ -2697,12 +2728,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -2813,12 +2844,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -2850,12 +2881,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -2935,12 +2968,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -3051,12 +3084,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -3088,12 +3121,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -3731,7 +3766,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
@@ -3854,7 +3890,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -3889,7 +3926,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -4007,9 +4047,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
@@ -4141,9 +4183,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
@@ -4179,9 +4223,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
@@ -4299,9 +4346,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
@@ -4433,9 +4482,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
@@ -4471,9 +4522,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
@@ -5137,9 +5191,11 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_release_acquire_cmpxchg:
@@ -5271,9 +5327,11 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg:
@@ -5309,9 +5367,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_release_acquire_cmpxchg:
@@ -5429,9 +5490,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
@@ -5563,9 +5626,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
@@ -5601,9 +5666,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
@@ -5721,9 +5789,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
@@ -5855,9 +5925,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
@@ -5893,9 +5965,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
@@ -6013,9 +6088,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
@@ -6147,9 +6224,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
@@ -6185,9 +6264,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
@@ -6923,7 +7005,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
@@ -7070,7 +7153,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
@@ -7113,7 +7197,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
@@ -7245,12 +7332,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -7399,12 +7486,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -7444,12 +7531,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -7577,12 +7666,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -7731,12 +7820,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -7776,12 +7865,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -8535,12 +8626,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -8689,12 +8780,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -8734,12 +8825,14 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -8867,12 +8960,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -9021,12 +9114,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -9066,12 +9159,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -9199,12 +9294,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -9353,12 +9448,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -9398,12 +9493,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -9531,7 +9628,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9685,7 +9783,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9730,7 +9829,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9863,7 +9965,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -10017,7 +10120,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -10062,7 +10166,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -10195,12 +10302,12 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -10349,12 +10456,12 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -10394,12 +10501,14 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -10527,12 +10636,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -10681,12 +10790,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -10726,12 +10835,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -10859,12 +10970,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -11013,12 +11124,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -11058,12 +11169,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -11732,10 +11845,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -11834,10 +11950,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -11868,10 +11987,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -12258,6 +12382,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -12339,6 +12465,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -12365,6 +12493,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -12430,6 +12562,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -12511,6 +12645,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -12537,6 +12673,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -12933,6 +13073,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -13014,6 +13156,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -13040,6 +13184,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -13107,7 +13255,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
@@ -13194,7 +13345,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
@@ -13222,7 +13376,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
@@ -13290,7 +13449,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
@@ -13377,7 +13539,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
@@ -13405,7 +13570,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
@@ -13696,10 +13866,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -13805,10 +13978,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -13841,10 +14017,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -13923,10 +14104,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -14032,10 +14216,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -14068,10 +14255,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -14699,6 +14891,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
@@ -14818,6 +15012,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -14852,6 +15048,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -14966,7 +15166,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
@@ -15091,7 +15294,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
@@ -15127,7 +15333,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
@@ -15242,7 +15453,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
@@ -15367,7 +15581,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
@@ -15403,7 +15620,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
@@ -16046,7 +16268,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
@@ -16171,7 +16396,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
@@ -16207,7 +16435,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
@@ -16322,7 +16555,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
@@ -16447,7 +16683,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
@@ -16483,7 +16722,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
@@ -16598,7 +16842,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
@@ -16723,7 +16970,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
@@ -16759,7 +17009,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
@@ -16874,6 +17129,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
@@ -16999,6 +17256,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -17035,6 +17294,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -17150,6 +17413,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
@@ -17275,6 +17540,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -17311,6 +17578,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -17426,7 +17697,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
@@ -17551,7 +17825,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
@@ -17587,7 +17864,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
@@ -17702,7 +17984,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
@@ -17827,7 +18112,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
@@ -17863,7 +18151,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
@@ -17978,7 +18271,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
@@ -18103,7 +18399,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
@@ -18139,7 +18438,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
@@ -18870,6 +19174,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
@@ -19013,6 +19319,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
@@ -19055,6 +19363,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
@@ -19185,10 +19497,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -19332,10 +19647,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -19376,10 +19694,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -19506,10 +19829,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -19653,10 +19979,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -19697,10 +20026,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -20445,10 +20779,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -20592,10 +20929,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -20636,10 +20976,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -20766,10 +21111,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -20913,10 +21261,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -20957,10 +21308,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -21087,10 +21443,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -21234,10 +21593,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -21278,10 +21640,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -21408,6 +21775,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
@@ -21555,6 +21924,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
@@ -21599,6 +21970,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
@@ -21729,6 +22104,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
@@ -21876,6 +22253,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
@@ -21920,6 +22299,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
@@ -22050,10 +22433,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -22197,10 +22583,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -22241,10 +22630,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -22371,10 +22765,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -22518,10 +22915,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -22562,10 +22962,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -22692,10 +23097,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -22839,10 +23247,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -22883,10 +23294,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
index 3bf5ed8..c326edf 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
@@ -959,7 +959,8 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
@@ -1001,7 +1002,8 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
@@ -1026,7 +1028,10 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
index b755c5d..868b438 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
@@ -667,7 +667,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -763,7 +764,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -790,7 +792,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -1204,7 +1209,8 @@ define amdgpu_kernel void @global_workgroup_release_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
@@ -1290,7 +1296,8 @@ define amdgpu_kernel void @global_workgroup_release_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
@@ -1315,7 +1322,10 @@ define amdgpu_kernel void @global_workgroup_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
@@ -1391,7 +1401,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
@@ -1477,7 +1488,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
@@ -1502,7 +1514,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
@@ -1918,7 +1933,8 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
@@ -2003,7 +2019,8 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
@@ -2028,7 +2045,10 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
@@ -2105,8 +2125,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_atomicrmw:
@@ -2196,8 +2218,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_acq_rel_atomicrmw:
@@ -2223,8 +2247,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_acq_rel_atomicrmw:
@@ -2301,8 +2329,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_atomicrmw:
@@ -2392,8 +2422,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_seq_cst_atomicrmw:
@@ -2419,8 +2451,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_seq_cst_atomicrmw:
@@ -2705,7 +2741,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -2807,7 +2844,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -2837,7 +2875,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -2926,7 +2967,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -3028,7 +3070,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -3058,7 +3101,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -3644,7 +3690,8 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -3758,7 +3805,8 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -3791,7 +3839,10 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -3900,8 +3951,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
@@ -4020,8 +4073,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
@@ -4055,8 +4110,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
@@ -4165,8 +4224,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
@@ -4285,8 +4346,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
@@ -4320,8 +4383,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
@@ -4920,8 +4987,10 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_release_acquire_cmpxchg:
@@ -5040,8 +5109,10 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_release_acquire_cmpxchg:
@@ -5075,8 +5146,12 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_release_acquire_cmpxchg:
@@ -5185,8 +5260,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
@@ -5305,8 +5382,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
@@ -5340,8 +5419,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
@@ -5450,8 +5533,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
@@ -5570,8 +5655,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
@@ -5605,8 +5692,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
@@ -5715,7 +5806,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -5835,7 +5927,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -5870,7 +5963,10 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -5980,7 +6076,8 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -6100,7 +6197,8 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -6135,7 +6233,10 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -6245,8 +6346,10 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_release_seq_cst_cmpxchg:
@@ -6365,8 +6468,10 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg:
@@ -6400,8 +6505,12 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_release_seq_cst_cmpxchg:
@@ -6510,8 +6619,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
@@ -6630,8 +6741,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
@@ -6665,8 +6778,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
@@ -6775,8 +6892,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
@@ -6895,8 +7014,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
@@ -6930,8 +7051,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
@@ -7588,7 +7713,8 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -7717,7 +7843,8 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -7754,7 +7881,10 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -7877,7 +8007,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -8009,7 +8140,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -8047,7 +8179,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -8170,7 +8305,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -8302,7 +8438,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -8340,7 +8477,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -9009,7 +9149,8 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -9141,7 +9282,8 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -9179,7 +9321,10 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -9302,7 +9447,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -9434,7 +9580,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -9472,7 +9619,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -9595,7 +9745,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -9727,7 +9878,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -9765,7 +9917,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -9888,7 +10043,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -10020,7 +10176,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -10058,7 +10215,10 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -10181,7 +10341,8 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -10313,7 +10474,8 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -10351,7 +10513,10 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -10474,7 +10639,8 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -10606,7 +10772,8 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -10644,7 +10811,10 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -10767,7 +10937,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -10899,7 +11070,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -10937,7 +11109,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -11060,7 +11235,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -11192,7 +11368,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -11230,7 +11407,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -11914,7 +12094,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -12009,7 +12190,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -12036,6 +12218,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -12447,6 +12633,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
@@ -12529,6 +12717,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
@@ -12553,6 +12743,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
@@ -12626,6 +12820,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
@@ -12708,6 +12904,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
@@ -12732,6 +12930,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
@@ -13145,6 +13347,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
@@ -13226,6 +13430,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
@@ -13250,6 +13456,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
@@ -13324,7 +13534,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
@@ -13411,7 +13624,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
@@ -13437,7 +13653,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
@@ -13512,7 +13733,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
@@ -13599,7 +13823,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
@@ -13625,7 +13852,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
@@ -13908,6 +14140,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -14006,6 +14240,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -14035,6 +14271,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -14121,6 +14361,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -14219,6 +14461,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -14248,6 +14492,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -14831,6 +15079,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -14941,6 +15191,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -14973,6 +15225,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -15079,7 +15335,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
@@ -15195,7 +15454,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
@@ -15229,7 +15491,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
@@ -15336,7 +15603,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
@@ -15452,7 +15722,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
@@ -15486,7 +15759,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
@@ -16083,7 +16361,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
@@ -16199,7 +16480,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
@@ -16233,7 +16517,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
@@ -16340,7 +16629,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
@@ -16456,7 +16748,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
@@ -16490,7 +16785,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
@@ -16597,7 +16897,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
@@ -16713,7 +17016,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
@@ -16747,7 +17053,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
@@ -16854,6 +17165,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -16970,6 +17283,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -17004,6 +17319,10 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -17111,6 +17430,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -17227,6 +17548,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -17261,6 +17584,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -17368,7 +17695,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
@@ -17484,7 +17814,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
@@ -17518,7 +17851,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
@@ -17625,7 +17963,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
@@ -17741,7 +18082,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
@@ -17775,7 +18119,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
@@ -17882,7 +18231,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
@@ -17998,7 +18350,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
@@ -18032,7 +18387,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
@@ -18687,6 +19047,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -18812,6 +19174,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -18848,6 +19212,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -18968,6 +19336,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -19096,6 +19466,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -19133,6 +19505,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -19253,6 +19629,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -19381,6 +19759,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -19418,6 +19798,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -20084,6 +20468,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -20212,6 +20598,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -20249,6 +20637,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -20369,6 +20761,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -20497,6 +20891,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -20534,6 +20930,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -20654,6 +21054,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -20782,6 +21184,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -20819,6 +21223,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -20939,6 +21347,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -21067,6 +21477,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -21104,6 +21516,10 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -21224,6 +21640,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -21352,6 +21770,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -21389,6 +21809,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -21509,6 +21933,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -21637,6 +22063,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -21674,6 +22102,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -21794,6 +22226,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -21922,6 +22356,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -21959,6 +22395,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -22079,6 +22519,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -22207,6 +22649,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -22244,6 +22688,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll
new file mode 100644
index 0000000..0b40c81
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll
@@ -0,0 +1,542 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx90a -global-isel=0 < %s | FileCheck --check-prefixes=GFX90A %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx942 -global-isel=0 < %s | FileCheck --check-prefixes=GFX942 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx942 -global-isel=1 < %s | FileCheck --check-prefixes=GFX942-GISEL %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -global-isel=0 < %s | FileCheck --check-prefixes=GFX10 %s
+
+define amdgpu_ps void @global_load_lds_dword_volatile(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: global_load_lds_dword_volatile:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_mov_b32_e32 v2, s1
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX90A-NEXT: s_mov_b32 m0, s2
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_load_dword v[0:1], off glc lds
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: global_load_lds_dword_volatile:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX942-NEXT: s_mov_b32 m0, s2
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: global_load_lds_dword_volatile:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_load_lds_dword_volatile:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT: s_mov_b32 m0, s2
+; GFX10-NEXT: global_load_dword v[0:1], off glc dlc lds
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds
+; GFX10-NEXT: s_endpgm
+ %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648)
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_load_lds_dword_nontemporal(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: global_load_lds_dword_nontemporal:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_mov_b32_e32 v2, s1
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT: s_mov_b32 m0, s2
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX90A-NEXT: global_load_dword v[0:1], off glc slc lds
+; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: global_load_lds_dword_nontemporal:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_mov_b32 m0, s2
+; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX942-NEXT: global_load_lds_dword v[0:1], off nt
+; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: global_load_lds_dword_nontemporal:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off nt
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_load_lds_dword_nontemporal:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT: s_mov_b32 m0, s2
+; GFX10-NEXT: global_load_dword v[0:1], off slc lds
+; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds
+; GFX10-NEXT: s_endpgm
+ %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_load_lds_dword_volatile_nontemporal(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: global_load_lds_dword_volatile_nontemporal:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_mov_b32_e32 v2, s1
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX90A-NEXT: s_mov_b32 m0, s2
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_load_dword v[0:1], off glc lds
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: global_load_lds_dword_volatile_nontemporal:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX942-NEXT: s_mov_b32 m0, s2
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: global_load_lds_dword_volatile_nontemporal:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_load_lds_dword_volatile_nontemporal:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT: s_mov_b32 m0, s2
+; GFX10-NEXT: global_load_dword v[0:1], off glc dlc lds
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds
+; GFX10-NEXT: s_endpgm
+ %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648), !nontemporal !0
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @load_to_lds_p1_dword_volatile(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: load_to_lds_p1_dword_volatile:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_mov_b32_e32 v2, s1
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX90A-NEXT: s_mov_b32 m0, s2
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_load_dword v[0:1], off glc lds
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: load_to_lds_p1_dword_volatile:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX942-NEXT: s_mov_b32 m0, s2
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: load_to_lds_p1_dword_volatile:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX10-LABEL: load_to_lds_p1_dword_volatile:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT: s_mov_b32 m0, s2
+; GFX10-NEXT: global_load_dword v[0:1], off glc dlc lds
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds
+; GFX10-NEXT: s_endpgm
+ %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off
+ call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648)
+ call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @load_to_lds_p1_dword_nontemporal(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: load_to_lds_p1_dword_nontemporal:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_mov_b32_e32 v2, s1
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT: s_mov_b32 m0, s2
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX90A-NEXT: global_load_dword v[0:1], off glc slc lds
+; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: load_to_lds_p1_dword_nontemporal:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_mov_b32 m0, s2
+; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX942-NEXT: global_load_lds_dword v[0:1], off nt
+; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: load_to_lds_p1_dword_nontemporal:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off nt
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX10-LABEL: load_to_lds_p1_dword_nontemporal:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT: s_mov_b32 m0, s2
+; GFX10-NEXT: global_load_dword v[0:1], off slc lds
+; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds
+; GFX10-NEXT: s_endpgm
+ %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off
+ call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0
+ call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @load_to_lds_p1_dword_volatile_nontemporal(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: load_to_lds_p1_dword_volatile_nontemporal:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_mov_b32_e32 v2, s1
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX90A-NEXT: s_mov_b32 m0, s2
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_load_dword v[0:1], off glc lds
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: load_to_lds_p1_dword_volatile_nontemporal:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX942-NEXT: s_mov_b32 m0, s2
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: load_to_lds_p1_dword_volatile_nontemporal:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX10-LABEL: load_to_lds_p1_dword_volatile_nontemporal:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT: s_mov_b32 m0, s2
+; GFX10-NEXT: global_load_dword v[0:1], off glc dlc lds
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds
+; GFX10-NEXT: s_endpgm
+ %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off
+ call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648), !nontemporal !0
+ call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @load_to_lds_p7_dword_volatile(ptr addrspace(7) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: load_to_lds_p7_dword_volatile:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX90A-NEXT: s_mov_b32 m0, s5
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: load_to_lds_p7_dword_volatile:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX942-NEXT: s_mov_b32 m0, s5
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: load_to_lds_p7_dword_volatile:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX10-LABEL: load_to_lds_p7_dword_volatile:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0
+; GFX10-NEXT: s_mov_b32 m0, s5
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc dlc lds
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX10-NEXT: s_endpgm
+ %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
+ call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648)
+ call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @load_to_lds_p7_dword_nontemporal(ptr addrspace(7) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: load_to_lds_p7_dword_nontemporal:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX90A-NEXT: s_mov_b32 m0, s5
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc slc lds
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: load_to_lds_p7_dword_nontemporal:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX942-NEXT: s_mov_b32 m0, s5
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: load_to_lds_p7_dword_nontemporal:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX10-LABEL: load_to_lds_p7_dword_nontemporal:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0
+; GFX10-NEXT: s_mov_b32 m0, s5
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen slc lds
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX10-NEXT: s_endpgm
+ %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
+ call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0
+ call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @load_to_lds_p7_dword_volatile_nontemporal(ptr addrspace(7) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: load_to_lds_p7_dword_volatile_nontemporal:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX90A-NEXT: s_mov_b32 m0, s5
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: load_to_lds_p7_dword_volatile_nontemporal:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX942-NEXT: s_mov_b32 m0, s5
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: load_to_lds_p7_dword_volatile_nontemporal:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX10-LABEL: load_to_lds_p7_dword_volatile_nontemporal:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0
+; GFX10-NEXT: s_mov_b32 m0, s5
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc dlc lds
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX10-NEXT: s_endpgm
+ %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
+ call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648), !nontemporal !0
+ call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @raw_ptr_buffer_load_lds_dword_volatile(ptr addrspace(8) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: raw_ptr_buffer_load_lds_dword_volatile:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_mov_b32 m0, s4
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: raw_ptr_buffer_load_lds_dword_volatile:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_mov_b32 m0, s4
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: raw_ptr_buffer_load_lds_dword_volatile:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s4
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX10-LABEL: raw_ptr_buffer_load_lds_dword_volatile:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 m0, s4
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc dlc lds
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX10-NEXT: s_endpgm
+ call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 0, i32 2147483648)
+ call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 512, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @raw_ptr_buffer_load_lds_dword_nontemporal(ptr addrspace(8) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: raw_ptr_buffer_load_lds_dword_nontemporal:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_mov_b32 m0, s4
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc slc lds
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: raw_ptr_buffer_load_lds_dword_nontemporal:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_mov_b32 m0, s4
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: raw_ptr_buffer_load_lds_dword_nontemporal:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s4
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX10-LABEL: raw_ptr_buffer_load_lds_dword_nontemporal:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 m0, s4
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen slc lds
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX10-NEXT: s_endpgm
+ call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 0, i32 0), !nontemporal !0
+ call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 512, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @raw_ptr_buffer_load_lds_dword_volatile_nontemporal(ptr addrspace(8) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: raw_ptr_buffer_load_lds_dword_volatile_nontemporal:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_mov_b32 m0, s4
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: raw_ptr_buffer_load_lds_dword_volatile_nontemporal:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_mov_b32 m0, s4
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: raw_ptr_buffer_load_lds_dword_volatile_nontemporal:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s4
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX10-LABEL: raw_ptr_buffer_load_lds_dword_volatile_nontemporal:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 m0, s4
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc dlc lds
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX10-NEXT: s_endpgm
+ call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 0, i32 2147483648), !nontemporal !0
+ call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 512, i32 0)
+ ret void
+}
+
+!0 = !{i32 1}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
index 986b48b..712109d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
@@ -622,7 +622,8 @@ define amdgpu_kernel void @local_agent_seq_cst_load(
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_read_b32 v1, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -719,7 +720,8 @@ define amdgpu_kernel void @local_agent_seq_cst_load(
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_load_b32 v1, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -749,7 +751,10 @@ define amdgpu_kernel void @local_agent_seq_cst_load(
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_load_b32 v1, v0
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -1121,7 +1126,8 @@ define amdgpu_kernel void @local_agent_release_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1200,7 +1206,8 @@ define amdgpu_kernel void @local_agent_release_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1225,7 +1232,10 @@ define amdgpu_kernel void @local_agent_release_store(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1291,7 +1301,8 @@ define amdgpu_kernel void @local_agent_seq_cst_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1370,7 +1381,8 @@ define amdgpu_kernel void @local_agent_seq_cst_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1395,7 +1407,10 @@ define amdgpu_kernel void @local_agent_seq_cst_store(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1778,7 +1793,8 @@ define amdgpu_kernel void @local_agent_release_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1857,7 +1873,8 @@ define amdgpu_kernel void @local_agent_release_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1882,7 +1899,10 @@ define amdgpu_kernel void @local_agent_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1952,7 +1972,8 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -2039,7 +2060,8 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -2067,7 +2089,10 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -2139,7 +2164,8 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -2226,7 +2252,8 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -2254,7 +2281,10 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -2535,7 +2565,8 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -2639,7 +2670,8 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2671,7 +2703,10 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2756,7 +2791,8 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -2860,7 +2896,8 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2892,7 +2929,10 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -3348,7 +3388,8 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -3441,7 +3482,8 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -3470,7 +3512,10 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -3551,7 +3596,8 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -3652,7 +3698,8 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -3684,7 +3731,10 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -3767,7 +3817,8 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -3868,7 +3919,8 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -3900,7 +3952,10 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4375,7 +4430,8 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4476,7 +4532,8 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4508,7 +4565,10 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4591,7 +4651,8 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4692,7 +4753,8 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4724,7 +4786,10 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4807,7 +4872,8 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4908,7 +4974,8 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4940,7 +5007,10 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5023,7 +5093,8 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5124,7 +5195,8 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5156,7 +5228,10 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5239,7 +5314,8 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5340,7 +5416,8 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5372,7 +5449,10 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5455,7 +5535,8 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5556,7 +5637,8 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5588,7 +5670,10 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5671,7 +5756,8 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5772,7 +5858,8 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5804,7 +5891,10 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5887,7 +5977,8 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5988,7 +6079,8 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -6020,7 +6112,10 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -6567,7 +6662,8 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -6682,7 +6778,8 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -6717,7 +6814,10 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: s_wait_dscnt 0x0
@@ -6814,7 +6914,8 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -6932,7 +7033,8 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -6968,7 +7070,10 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7065,7 +7170,8 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -7183,7 +7289,8 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7219,7 +7326,10 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7778,7 +7888,8 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -7896,7 +8007,8 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7932,7 +8044,10 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8029,7 +8144,8 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8147,7 +8263,8 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8183,7 +8300,10 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8280,7 +8400,8 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8398,7 +8519,8 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8434,7 +8556,10 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8531,7 +8656,8 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8649,7 +8775,8 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8685,7 +8812,10 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8782,7 +8912,8 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8900,7 +9031,8 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8936,7 +9068,10 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9033,7 +9168,8 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9151,7 +9287,8 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9187,7 +9324,10 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9284,7 +9424,8 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9402,7 +9543,8 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9438,7 +9580,10 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9535,7 +9680,8 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9653,7 +9799,8 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9689,7 +9836,10 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll
index 8926893..6d1e4e6 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll
@@ -622,7 +622,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_load(
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_read_b32 v1, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -719,7 +720,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_load(
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_load_b32 v1, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -749,7 +751,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_load(
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_load_b32 v1, v0
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -1121,7 +1126,8 @@ define amdgpu_kernel void @local_cluster_release_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1200,7 +1206,8 @@ define amdgpu_kernel void @local_cluster_release_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1225,7 +1232,10 @@ define amdgpu_kernel void @local_cluster_release_store(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1291,7 +1301,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1370,7 +1381,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1395,7 +1407,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_store(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1778,7 +1793,8 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1857,7 +1873,8 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1882,7 +1899,10 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1952,7 +1972,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -2039,7 +2060,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -2067,7 +2089,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -2139,7 +2164,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -2226,7 +2252,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -2254,7 +2281,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -2535,7 +2565,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -2639,7 +2670,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2671,7 +2703,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2756,7 +2791,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -2860,7 +2896,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2892,7 +2929,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -3348,7 +3388,8 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -3441,7 +3482,8 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -3470,7 +3512,10 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -3551,7 +3596,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -3652,7 +3698,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -3684,7 +3731,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -3767,7 +3817,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -3868,7 +3919,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -3900,7 +3952,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4375,7 +4430,8 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4476,7 +4532,8 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4508,7 +4565,10 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4591,7 +4651,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4692,7 +4753,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4724,7 +4786,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4807,7 +4872,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4908,7 +4974,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4940,7 +5007,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5023,7 +5093,8 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5124,7 +5195,8 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5156,7 +5228,10 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5239,7 +5314,8 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5340,7 +5416,8 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5372,7 +5449,10 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5455,7 +5535,8 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5556,7 +5637,8 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5588,7 +5670,10 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5671,7 +5756,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5772,7 +5858,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5804,7 +5891,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5887,7 +5977,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5988,7 +6079,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -6020,7 +6112,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -6567,7 +6662,8 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -6682,7 +6778,8 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -6717,7 +6814,10 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: s_wait_dscnt 0x0
@@ -6814,7 +6914,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -6932,7 +7033,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -6968,7 +7070,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7065,7 +7170,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -7183,7 +7289,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7219,7 +7326,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7778,7 +7888,8 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -7896,7 +8007,8 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7932,7 +8044,10 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8029,7 +8144,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8147,7 +8263,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8183,7 +8300,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8280,7 +8400,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8398,7 +8519,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8434,7 +8556,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8531,7 +8656,8 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8649,7 +8775,8 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8685,7 +8812,10 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8782,7 +8912,8 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8900,7 +9031,8 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8936,7 +9068,10 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9033,7 +9168,8 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9151,7 +9287,8 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9187,7 +9324,10 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9284,7 +9424,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9402,7 +9543,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9438,7 +9580,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9535,7 +9680,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9653,7 +9799,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9689,7 +9836,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
index 81bbe0a..577d2ca 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
@@ -622,7 +622,8 @@ define amdgpu_kernel void @local_system_seq_cst_load(
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_read_b32 v1, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -719,7 +720,8 @@ define amdgpu_kernel void @local_system_seq_cst_load(
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_load_b32 v1, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -749,7 +751,10 @@ define amdgpu_kernel void @local_system_seq_cst_load(
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_load_b32 v1, v0
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -1121,7 +1126,8 @@ define amdgpu_kernel void @local_system_release_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1200,7 +1206,8 @@ define amdgpu_kernel void @local_system_release_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1225,7 +1232,10 @@ define amdgpu_kernel void @local_system_release_store(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1291,7 +1301,8 @@ define amdgpu_kernel void @local_system_seq_cst_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1370,7 +1381,8 @@ define amdgpu_kernel void @local_system_seq_cst_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1395,7 +1407,10 @@ define amdgpu_kernel void @local_system_seq_cst_store(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1778,7 +1793,8 @@ define amdgpu_kernel void @local_system_release_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1857,7 +1873,8 @@ define amdgpu_kernel void @local_system_release_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1882,7 +1899,10 @@ define amdgpu_kernel void @local_system_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1952,7 +1972,8 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -2039,7 +2060,8 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -2067,7 +2089,10 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -2139,7 +2164,8 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -2226,7 +2252,8 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -2254,7 +2281,10 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -2535,7 +2565,8 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -2639,7 +2670,8 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2671,7 +2703,10 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2756,7 +2791,8 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -2860,7 +2896,8 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2892,7 +2929,10 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -3348,7 +3388,8 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -3441,7 +3482,8 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -3470,7 +3512,10 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -3551,7 +3596,8 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -3652,7 +3698,8 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -3684,7 +3731,10 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -3767,7 +3817,8 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -3868,7 +3919,8 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -3900,7 +3952,10 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4375,7 +4430,8 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4476,7 +4532,8 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4508,7 +4565,10 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4591,7 +4651,8 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4692,7 +4753,8 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4724,7 +4786,10 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4807,7 +4872,8 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4908,7 +4974,8 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4940,7 +5007,10 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5023,7 +5093,8 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5124,7 +5195,8 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5156,7 +5228,10 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5239,7 +5314,8 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5340,7 +5416,8 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5372,7 +5449,10 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5455,7 +5535,8 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5556,7 +5637,8 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5588,7 +5670,10 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5671,7 +5756,8 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5772,7 +5858,8 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5804,7 +5891,10 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5887,7 +5977,8 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5988,7 +6079,8 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -6020,7 +6112,10 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -6567,7 +6662,8 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -6682,7 +6778,8 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -6717,7 +6814,10 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: s_wait_dscnt 0x0
@@ -6814,7 +6914,8 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -6932,7 +7033,8 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -6968,7 +7070,10 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7065,7 +7170,8 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -7183,7 +7289,8 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7219,7 +7326,10 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7778,7 +7888,8 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -7896,7 +8007,8 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7932,7 +8044,10 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8029,7 +8144,8 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8147,7 +8263,8 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8183,7 +8300,10 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8280,7 +8400,8 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8398,7 +8519,8 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8434,7 +8556,10 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8531,7 +8656,8 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8649,7 +8775,8 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8685,7 +8812,10 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8782,7 +8912,8 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8900,7 +9031,8 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8936,7 +9068,10 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9033,7 +9168,8 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9151,7 +9287,8 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9187,7 +9324,10 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9284,7 +9424,8 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9402,7 +9543,8 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9438,7 +9580,10 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9535,7 +9680,8 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9653,7 +9799,8 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9689,7 +9836,10 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index 980141a..d686e7a 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -819,7 +819,8 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -854,7 +855,8 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -879,7 +881,10 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
index 6a233a2..ab4d783 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
@@ -622,7 +622,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load(
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_read_b32 v1, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -719,7 +720,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load(
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_load_b32 v1, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -749,7 +751,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load(
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_load_b32 v1, v0
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -1121,7 +1126,8 @@ define amdgpu_kernel void @local_workgroup_release_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1200,7 +1206,8 @@ define amdgpu_kernel void @local_workgroup_release_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1225,7 +1232,10 @@ define amdgpu_kernel void @local_workgroup_release_store(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1291,7 +1301,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1370,7 +1381,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1395,7 +1407,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1778,7 +1793,8 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1857,7 +1873,8 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1882,7 +1899,10 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1952,7 +1972,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -2039,7 +2060,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -2067,7 +2089,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -2139,7 +2164,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -2226,7 +2252,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -2254,7 +2281,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -2535,7 +2565,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -2639,7 +2670,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2671,7 +2703,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2756,7 +2791,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -2860,7 +2896,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2892,7 +2929,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -3348,7 +3388,8 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -3441,7 +3482,8 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -3470,7 +3512,10 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -3551,7 +3596,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -3652,7 +3698,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -3684,7 +3731,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -3767,7 +3817,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -3868,7 +3919,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -3900,7 +3952,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4375,7 +4430,8 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4476,7 +4532,8 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4508,7 +4565,10 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4591,7 +4651,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4692,7 +4753,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4724,7 +4786,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4807,7 +4872,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4908,7 +4974,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4940,7 +5007,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5023,7 +5093,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5124,7 +5195,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5156,7 +5228,10 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5239,7 +5314,8 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5340,7 +5416,8 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5372,7 +5449,10 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5455,7 +5535,8 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5556,7 +5637,8 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5588,7 +5670,10 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5671,7 +5756,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5772,7 +5858,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5804,7 +5891,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5887,7 +5977,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5988,7 +6079,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -6020,7 +6112,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -6567,7 +6662,8 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -6682,7 +6778,8 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -6717,7 +6814,10 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: s_wait_dscnt 0x0
@@ -6814,7 +6914,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -6932,7 +7033,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -6968,7 +7070,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7065,7 +7170,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -7183,7 +7289,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7219,7 +7326,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7778,7 +7888,8 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -7896,7 +8007,8 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7932,7 +8044,10 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8029,7 +8144,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8147,7 +8263,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8183,7 +8300,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8280,7 +8400,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8398,7 +8519,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8434,7 +8556,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8531,7 +8656,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8649,7 +8775,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8685,7 +8812,10 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8782,7 +8912,8 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8900,7 +9031,8 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8936,7 +9068,10 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9033,7 +9168,8 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9151,7 +9287,8 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9187,7 +9324,10 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9284,7 +9424,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9402,7 +9543,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9438,7 +9580,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9535,7 +9680,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9653,7 +9799,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9689,7 +9836,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir
new file mode 100644
index 0000000..93f7bcc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir
@@ -0,0 +1,83 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -passes=postmisched -o - %s | FileCheck %s
+
+# Ensure WMMA operations stay before the final atomic fence and barrier group.
+# This allows the latency of the WMMA operations to be hidden by barrier wait.
+---
+name: test
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr12, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8, $vgpr9_vgpr10_vgpr11_vgpr12, $vgpr13_vgpr14_vgpr15_vgpr16, $vgpr17_vgpr18_vgpr19_vgpr20, $vgpr21_vgpr22_vgpr23_vgpr24, $vgpr25_vgpr26_vgpr27_vgpr28, $vgpr29_vgpr30_vgpr31_vgpr32
+
+ ; CHECK-LABEL: name: test
+ ; CHECK: liveins: $sgpr0, $sgpr12, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8, $vgpr9_vgpr10_vgpr11_vgpr12, $vgpr13_vgpr14_vgpr15_vgpr16, $vgpr17_vgpr18_vgpr19_vgpr20, $vgpr21_vgpr22_vgpr23_vgpr24, $vgpr25_vgpr26_vgpr27_vgpr28, $vgpr29_vgpr30_vgpr31_vgpr32
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: ATOMIC_FENCE 5, 2
+ ; CHECK-NEXT: S_BARRIER
+ ; CHECK-NEXT: ATOMIC_FENCE 4, 2
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr41_vgpr42_vgpr43_vgpr44, implicit-def $vgpr41, implicit-def $vgpr41_lo16, implicit-def $vgpr41_hi16, implicit-def $vgpr42, implicit-def $vgpr42_lo16, implicit-def $vgpr42_hi16, implicit-def $vgpr43, implicit-def $vgpr43_lo16, implicit-def $vgpr43_hi16, implicit-def $vgpr44, implicit-def $vgpr44_lo16, implicit-def $vgpr44_hi16, implicit-def $vgpr41_vgpr42, implicit-def $vgpr41_vgpr42_vgpr43, implicit-def $vgpr42_vgpr43, implicit-def $vgpr42_vgpr43_vgpr44, implicit-def $vgpr43_vgpr44, implicit-def $vgpr45_vgpr46_vgpr47_vgpr48, implicit-def $vgpr45, implicit-def $vgpr45_lo16, implicit-def $vgpr45_hi16, implicit-def $vgpr46, implicit-def $vgpr46_lo16, implicit-def $vgpr46_hi16, implicit-def $vgpr47, implicit-def $vgpr47_lo16, implicit-def $vgpr47_hi16, implicit-def $vgpr48, implicit-def $vgpr48_lo16, implicit-def $vgpr48_hi16, implicit-def $vgpr45_vgpr46, implicit-def $vgpr45_vgpr46_vgpr47, implicit-def $vgpr46_vgpr47, implicit-def $vgpr46_vgpr47_vgpr48, implicit-def $vgpr47_vgpr48, implicit-def $vgpr49_vgpr50_vgpr51_vgpr52, implicit-def $vgpr49, implicit-def $vgpr49_lo16, implicit-def $vgpr49_hi16, implicit-def $vgpr50, implicit-def $vgpr50_lo16, implicit-def $vgpr50_hi16, implicit-def $vgpr51, implicit-def $vgpr51_lo16, implicit-def $vgpr51_hi16, implicit-def $vgpr52, implicit-def $vgpr52_lo16, implicit-def $vgpr52_hi16, implicit-def $vgpr49_vgpr50, implicit-def $vgpr49_vgpr50_vgpr51, implicit-def $vgpr50_vgpr51, implicit-def $vgpr50_vgpr51_vgpr52, implicit-def $vgpr51_vgpr52, implicit-def $vgpr53_vgpr54_vgpr55_vgpr56, implicit-def $vgpr53, implicit-def $vgpr53_lo16, implicit-def $vgpr53_hi16, implicit-def $vgpr54, implicit-def $vgpr54_lo16, implicit-def $vgpr54_hi16, implicit-def $vgpr55, implicit-def $vgpr55_lo16, implicit-def $vgpr55_hi16, implicit-def $vgpr56, implicit-def $vgpr56_lo16, implicit-def $vgpr56_hi16, implicit-def $vgpr53_vgpr54, implicit-def $vgpr53_vgpr54_vgpr55, implicit-def $vgpr54_vgpr55, implicit-def $vgpr54_vgpr55_vgpr56, implicit-def $vgpr55_vgpr56, implicit-def $vgpr57_vgpr58_vgpr59_vgpr60, implicit-def $vgpr57, implicit-def $vgpr57_lo16, implicit-def $vgpr57_hi16, implicit-def $vgpr58, implicit-def $vgpr58_lo16, implicit-def $vgpr58_hi16, implicit-def $vgpr59, implicit-def $vgpr59_lo16, implicit-def $vgpr59_hi16, implicit-def $vgpr60, implicit-def $vgpr60_lo16, implicit-def $vgpr60_hi16, implicit-def $vgpr57_vgpr58, implicit-def $vgpr57_vgpr58_vgpr59, implicit-def $vgpr58_vgpr59, implicit-def $vgpr58_vgpr59_vgpr60, implicit-def $vgpr59_vgpr60, implicit-def $vgpr61_vgpr62_vgpr63_vgpr64, implicit-def $vgpr61, implicit-def $vgpr61_lo16, implicit-def $vgpr61_hi16, implicit-def $vgpr62, implicit-def $vgpr62_lo16, implicit-def $vgpr62_hi16, implicit-def $vgpr63, implicit-def $vgpr63_lo16, implicit-def $vgpr63_hi16, implicit-def $vgpr64, implicit-def $vgpr64_lo16, implicit-def $vgpr64_hi16, implicit-def $vgpr61_vgpr62, implicit-def $vgpr61_vgpr62_vgpr63, implicit-def $vgpr62_vgpr63, implicit-def $vgpr62_vgpr63_vgpr64, implicit-def $vgpr63_vgpr64, implicit-def $vgpr65_vgpr66_vgpr67_vgpr68, implicit-def $vgpr65, implicit-def $vgpr65_lo16, implicit-def $vgpr65_hi16, implicit-def $vgpr66, implicit-def $vgpr66_lo16, implicit-def $vgpr66_hi16, implicit-def $vgpr67, implicit-def $vgpr67_lo16, implicit-def $vgpr67_hi16, implicit-def $vgpr68, implicit-def $vgpr68_lo16, implicit-def $vgpr68_hi16, implicit-def $vgpr65_vgpr66, implicit-def $vgpr65_vgpr66_vgpr67, implicit-def $vgpr66_vgpr67, implicit-def $vgpr66_vgpr67_vgpr68, implicit-def $vgpr67_vgpr68, implicit-def $vgpr69_vgpr70_vgpr71_vgpr72, implicit-def $vgpr69, implicit-def $vgpr69_lo16, implicit-def $vgpr69_hi16, implicit-def $vgpr70, implicit-def $vgpr70_lo16, implicit-def $vgpr70_hi16, implicit-def $vgpr71, implicit-def $vgpr71_lo16, implicit-def $vgpr71_hi16, implicit-def $vgpr72, implicit-def $vgpr72_lo16, implicit-def $vgpr72_hi16, implicit-def $vgpr69_vgpr70, implicit-def $vgpr69_vgpr70_vgpr71, implicit-def $vgpr70_vgpr71, implicit-def $vgpr70_vgpr71_vgpr72, implicit-def $vgpr71_vgpr72, implicit-def $vgpr73_vgpr74_vgpr75_vgpr76, implicit-def $vgpr73, implicit-def $vgpr73_lo16, implicit-def $vgpr73_hi16, implicit-def $vgpr74, implicit-def $vgpr74_lo16, implicit-def $vgpr74_hi16, implicit-def $vgpr75, implicit-def $vgpr75_lo16, implicit-def $vgpr75_hi16, implicit-def $vgpr76, implicit-def $vgpr76_lo16, implicit-def $vgpr76_hi16, implicit-def $vgpr73_vgpr74, implicit-def $vgpr73_vgpr74_vgpr75, implicit-def $vgpr74_vgpr75, implicit-def $vgpr74_vgpr75_vgpr76, implicit-def $vgpr75_vgpr76, implicit-def $vgpr77_vgpr78_vgpr79_vgpr80, implicit-def $vgpr77, implicit-def $vgpr77_lo16, implicit-def $vgpr77_hi16, implicit-def $vgpr78, implicit-def $vgpr78_lo16, implicit-def $vgpr78_hi16, implicit-def $vgpr79, implicit-def $vgpr79_lo16, implicit-def $vgpr79_hi16, implicit-def $vgpr80, implicit-def $vgpr80_lo16, implicit-def $vgpr80_hi16, implicit-def $vgpr77_vgpr78, implicit-def $vgpr77_vgpr78_vgpr79, implicit-def $vgpr78_vgpr79, implicit-def $vgpr78_vgpr79_vgpr80, implicit-def $vgpr79_vgpr80, implicit-def $vgpr81_vgpr82_vgpr83_vgpr84, implicit-def $vgpr81, implicit-def $vgpr81_lo16, implicit-def $vgpr81_hi16, implicit-def $vgpr82, implicit-def $vgpr82_lo16, implicit-def $vgpr82_hi16, implicit-def $vgpr83, implicit-def $vgpr83_lo16, implicit-def $vgpr83_hi16, implicit-def $vgpr84, implicit-def $vgpr84_lo16, implicit-def $vgpr84_hi16, implicit-def $vgpr81_vgpr82, implicit-def $vgpr81_vgpr82_vgpr83, implicit-def $vgpr82_vgpr83, implicit-def $vgpr82_vgpr83_vgpr84, implicit-def $vgpr83_vgpr84, implicit-def $vgpr85_vgpr86_vgpr87_vgpr88, implicit-def $vgpr85, implicit-def $vgpr85_lo16, implicit-def $vgpr85_hi16, implicit-def $vgpr86, implicit-def $vgpr86_lo16, implicit-def $vgpr86_hi16, implicit-def $vgpr87, implicit-def $vgpr87_lo16, implicit-def $vgpr87_hi16, implicit-def $vgpr88, implicit-def $vgpr88_lo16, implicit-def $vgpr88_hi16, implicit-def $vgpr85_vgpr86, implicit-def $vgpr85_vgpr86_vgpr87, implicit-def $vgpr86_vgpr87, implicit-def $vgpr86_vgpr87_vgpr88, implicit-def $vgpr87_vgpr88, implicit killed $vgpr36, implicit $exec, implicit killed $vgpr37, implicit killed $vgpr38 {
+ ; CHECK-NEXT: $vgpr41_vgpr42_vgpr43_vgpr44 = DS_READ_B128_gfx9 $vgpr36, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr45_vgpr46_vgpr47_vgpr48 = DS_READ2_B64_gfx9 $vgpr36, 2, 3, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr49_vgpr50_vgpr51_vgpr52 = DS_READ_B128_gfx9 $vgpr37, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr53_vgpr54_vgpr55_vgpr56 = DS_READ2_B64_gfx9 $vgpr37, 2, 3, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr57_vgpr58_vgpr59_vgpr60 = DS_READ_B128_gfx9 $vgpr37, 768, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr61_vgpr62_vgpr63_vgpr64 = DS_READ2_B64_gfx9 killed $vgpr37, 98, 99, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr65_vgpr66_vgpr67_vgpr68 = DS_READ_B128_gfx9 $vgpr36, 768, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr69_vgpr70_vgpr71_vgpr72 = DS_READ2_B64_gfx9 $vgpr36, 98, 99, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr73_vgpr74_vgpr75_vgpr76 = DS_READ_B128_gfx9 $vgpr36, 1536, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr77_vgpr78_vgpr79_vgpr80 = DS_READ2_B64_gfx9 $vgpr36, 194, 195, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr81_vgpr82_vgpr83_vgpr84 = DS_READ_B128_gfx9 killed $vgpr36, 2304, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr85_vgpr86_vgpr87_vgpr88 = DS_READ2_B64_gfx9 killed $vgpr38, 0, 1, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: $sgpr1 = S_ADD_I32 $sgpr0, 16, implicit-def dead $scc
+ ; CHECK-NEXT: $vgpr39 = V_ADD_U32_e32 32, killed $vgpr39, implicit $exec
+ ; CHECK-NEXT: $vgpr40 = V_ADD_U32_e32 32, killed $vgpr40, implicit $exec
+ ; CHECK-NEXT: S_CMP_LT_U32 killed $sgpr0, killed $sgpr12, implicit-def $scc
+ ; CHECK-NEXT: $sgpr0 = S_MOV_B32 killed $sgpr1
+ ; CHECK-NEXT: early-clobber $vgpr29_vgpr30_vgpr31_vgpr32 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr29_vgpr30_vgpr31_vgpr32, 0, 0, implicit $exec
+ ; CHECK-NEXT: early-clobber $vgpr25_vgpr26_vgpr27_vgpr28 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr25_vgpr26_vgpr27_vgpr28, 0, 0, implicit $exec
+ ; CHECK-NEXT: early-clobber $vgpr21_vgpr22_vgpr23_vgpr24 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr21_vgpr22_vgpr23_vgpr24, 0, 0, implicit $exec
+ ; CHECK-NEXT: early-clobber $vgpr17_vgpr18_vgpr19_vgpr20 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr17_vgpr18_vgpr19_vgpr20, 0, 0, implicit $exec
+ ; CHECK-NEXT: early-clobber $vgpr13_vgpr14_vgpr15_vgpr16 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr13_vgpr14_vgpr15_vgpr16, 0, 0, implicit $exec
+ ; CHECK-NEXT: early-clobber $vgpr9_vgpr10_vgpr11_vgpr12 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr9_vgpr10_vgpr11_vgpr12, 0, 0, implicit $exec
+ ; CHECK-NEXT: early-clobber $vgpr5_vgpr6_vgpr7_vgpr8 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88, 8, killed $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr5_vgpr6_vgpr7_vgpr8, 0, 0, implicit $exec
+ ; CHECK-NEXT: early-clobber $vgpr1_vgpr2_vgpr3_vgpr4 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88, 8, killed $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr1_vgpr2_vgpr3_vgpr4, 0, 0, implicit $exec
+ ; CHECK-NEXT: ATOMIC_FENCE 5, 2
+ ; CHECK-NEXT: S_BARRIER
+ ; CHECK-NEXT: ATOMIC_FENCE 4, 2
+ ATOMIC_FENCE 5, 2
+ S_BARRIER
+ ATOMIC_FENCE 4, 2
+ BUNDLE implicit-def $vgpr41_vgpr42_vgpr43_vgpr44, implicit-def $vgpr41, implicit-def $vgpr41_lo16, implicit-def $vgpr41_hi16, implicit-def $vgpr42, implicit-def $vgpr42_lo16, implicit-def $vgpr42_hi16, implicit-def $vgpr43, implicit-def $vgpr43_lo16, implicit-def $vgpr43_hi16, implicit-def $vgpr44, implicit-def $vgpr44_lo16, implicit-def $vgpr44_hi16, implicit-def $vgpr41_vgpr42, implicit-def $vgpr41_vgpr42_vgpr43, implicit-def $vgpr42_vgpr43, implicit-def $vgpr42_vgpr43_vgpr44, implicit-def $vgpr43_vgpr44, implicit-def $vgpr45_vgpr46_vgpr47_vgpr48, implicit-def $vgpr45, implicit-def $vgpr45_lo16, implicit-def $vgpr45_hi16, implicit-def $vgpr46, implicit-def $vgpr46_lo16, implicit-def $vgpr46_hi16, implicit-def $vgpr47, implicit-def $vgpr47_lo16, implicit-def $vgpr47_hi16, implicit-def $vgpr48, implicit-def $vgpr48_lo16, implicit-def $vgpr48_hi16, implicit-def $vgpr45_vgpr46, implicit-def $vgpr45_vgpr46_vgpr47, implicit-def $vgpr46_vgpr47, implicit-def $vgpr46_vgpr47_vgpr48, implicit-def $vgpr47_vgpr48, implicit-def $vgpr49_vgpr50_vgpr51_vgpr52, implicit-def $vgpr49, implicit-def $vgpr49_lo16, implicit-def $vgpr49_hi16, implicit-def $vgpr50, implicit-def $vgpr50_lo16, implicit-def $vgpr50_hi16, implicit-def $vgpr51, implicit-def $vgpr51_lo16, implicit-def $vgpr51_hi16, implicit-def $vgpr52, implicit-def $vgpr52_lo16, implicit-def $vgpr52_hi16, implicit-def $vgpr49_vgpr50, implicit-def $vgpr49_vgpr50_vgpr51, implicit-def $vgpr50_vgpr51, implicit-def $vgpr50_vgpr51_vgpr52, implicit-def $vgpr51_vgpr52, implicit-def $vgpr53_vgpr54_vgpr55_vgpr56, implicit-def $vgpr53, implicit-def $vgpr53_lo16, implicit-def $vgpr53_hi16, implicit-def $vgpr54, implicit-def $vgpr54_lo16, implicit-def $vgpr54_hi16, implicit-def $vgpr55, implicit-def $vgpr55_lo16, implicit-def $vgpr55_hi16, implicit-def $vgpr56, implicit-def $vgpr56_lo16, implicit-def $vgpr56_hi16, implicit-def $vgpr53_vgpr54, implicit-def $vgpr53_vgpr54_vgpr55, implicit-def $vgpr54_vgpr55, implicit-def $vgpr54_vgpr55_vgpr56, implicit-def $vgpr55_vgpr56, implicit-def $vgpr57_vgpr58_vgpr59_vgpr60, implicit-def $vgpr57, implicit-def $vgpr57_lo16, implicit-def $vgpr57_hi16, implicit-def $vgpr58, implicit-def $vgpr58_lo16, implicit-def $vgpr58_hi16, implicit-def $vgpr59, implicit-def $vgpr59_lo16, implicit-def $vgpr59_hi16, implicit-def $vgpr60, implicit-def $vgpr60_lo16, implicit-def $vgpr60_hi16, implicit-def $vgpr57_vgpr58, implicit-def $vgpr57_vgpr58_vgpr59, implicit-def $vgpr58_vgpr59, implicit-def $vgpr58_vgpr59_vgpr60, implicit-def $vgpr59_vgpr60, implicit-def $vgpr61_vgpr62_vgpr63_vgpr64, implicit-def $vgpr61, implicit-def $vgpr61_lo16, implicit-def $vgpr61_hi16, implicit-def $vgpr62, implicit-def $vgpr62_lo16, implicit-def $vgpr62_hi16, implicit-def $vgpr63, implicit-def $vgpr63_lo16, implicit-def $vgpr63_hi16, implicit-def $vgpr64, implicit-def $vgpr64_lo16, implicit-def $vgpr64_hi16, implicit-def $vgpr61_vgpr62, implicit-def $vgpr61_vgpr62_vgpr63, implicit-def $vgpr62_vgpr63, implicit-def $vgpr62_vgpr63_vgpr64, implicit-def $vgpr63_vgpr64, implicit-def $vgpr65_vgpr66_vgpr67_vgpr68, implicit-def $vgpr65, implicit-def $vgpr65_lo16, implicit-def $vgpr65_hi16, implicit-def $vgpr66, implicit-def $vgpr66_lo16, implicit-def $vgpr66_hi16, implicit-def $vgpr67, implicit-def $vgpr67_lo16, implicit-def $vgpr67_hi16, implicit-def $vgpr68, implicit-def $vgpr68_lo16, implicit-def $vgpr68_hi16, implicit-def $vgpr65_vgpr66, implicit-def $vgpr65_vgpr66_vgpr67, implicit-def $vgpr66_vgpr67, implicit-def $vgpr66_vgpr67_vgpr68, implicit-def $vgpr67_vgpr68, implicit-def $vgpr69_vgpr70_vgpr71_vgpr72, implicit-def $vgpr69, implicit-def $vgpr69_lo16, implicit-def $vgpr69_hi16, implicit-def $vgpr70, implicit-def $vgpr70_lo16, implicit-def $vgpr70_hi16, implicit-def $vgpr71, implicit-def $vgpr71_lo16, implicit-def $vgpr71_hi16, implicit-def $vgpr72, implicit-def $vgpr72_lo16, implicit-def $vgpr72_hi16, implicit-def $vgpr69_vgpr70, implicit-def $vgpr69_vgpr70_vgpr71, implicit-def $vgpr70_vgpr71, implicit-def $vgpr70_vgpr71_vgpr72, implicit-def $vgpr71_vgpr72, implicit-def $vgpr73_vgpr74_vgpr75_vgpr76, implicit-def $vgpr73, implicit-def $vgpr73_lo16, implicit-def $vgpr73_hi16, implicit-def $vgpr74, implicit-def $vgpr74_lo16, implicit-def $vgpr74_hi16, implicit-def $vgpr75, implicit-def $vgpr75_lo16, implicit-def $vgpr75_hi16, implicit-def $vgpr76, implicit-def $vgpr76_lo16, implicit-def $vgpr76_hi16, implicit-def $vgpr73_vgpr74, implicit-def $vgpr73_vgpr74_vgpr75, implicit-def $vgpr74_vgpr75, implicit-def $vgpr74_vgpr75_vgpr76, implicit-def $vgpr75_vgpr76, implicit-def $vgpr77_vgpr78_vgpr79_vgpr80, implicit-def $vgpr77, implicit-def $vgpr77_lo16, implicit-def $vgpr77_hi16, implicit-def $vgpr78, implicit-def $vgpr78_lo16, implicit-def $vgpr78_hi16, implicit-def $vgpr79, implicit-def $vgpr79_lo16, implicit-def $vgpr79_hi16, implicit-def $vgpr80, implicit-def $vgpr80_lo16, implicit-def $vgpr80_hi16, implicit-def $vgpr77_vgpr78, implicit-def $vgpr77_vgpr78_vgpr79, implicit-def $vgpr78_vgpr79, implicit-def $vgpr78_vgpr79_vgpr80, implicit-def $vgpr79_vgpr80, implicit-def $vgpr81_vgpr82_vgpr83_vgpr84, implicit-def $vgpr81, implicit-def $vgpr81_lo16, implicit-def $vgpr81_hi16, implicit-def $vgpr82, implicit-def $vgpr82_lo16, implicit-def $vgpr82_hi16, implicit-def $vgpr83, implicit-def $vgpr83_lo16, implicit-def $vgpr83_hi16, implicit-def $vgpr84, implicit-def $vgpr84_lo16, implicit-def $vgpr84_hi16, implicit-def $vgpr81_vgpr82, implicit-def $vgpr81_vgpr82_vgpr83, implicit-def $vgpr82_vgpr83, implicit-def $vgpr82_vgpr83_vgpr84, implicit-def $vgpr83_vgpr84, implicit-def $vgpr85_vgpr86_vgpr87_vgpr88, implicit-def $vgpr85, implicit-def $vgpr85_lo16, implicit-def $vgpr85_hi16, implicit-def $vgpr86, implicit-def $vgpr86_lo16, implicit-def $vgpr86_hi16, implicit-def $vgpr87, implicit-def $vgpr87_lo16, implicit-def $vgpr87_hi16, implicit-def $vgpr88, implicit-def $vgpr88_lo16, implicit-def $vgpr88_hi16, implicit-def $vgpr85_vgpr86, implicit-def $vgpr85_vgpr86_vgpr87, implicit-def $vgpr86_vgpr87, implicit-def $vgpr86_vgpr87_vgpr88, implicit-def $vgpr87_vgpr88, implicit $vgpr36, implicit $exec, implicit $vgpr37, implicit $vgpr38 {
+ $vgpr41_vgpr42_vgpr43_vgpr44 = DS_READ_B128_gfx9 $vgpr36, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr45_vgpr46_vgpr47_vgpr48 = DS_READ2_B64_gfx9 $vgpr36, 2, 3, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr49_vgpr50_vgpr51_vgpr52 = DS_READ_B128_gfx9 $vgpr37, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr53_vgpr54_vgpr55_vgpr56 = DS_READ2_B64_gfx9 $vgpr37, 2, 3, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr57_vgpr58_vgpr59_vgpr60 = DS_READ_B128_gfx9 $vgpr37, 768, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr61_vgpr62_vgpr63_vgpr64 = DS_READ2_B64_gfx9 $vgpr37, 98, 99, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr65_vgpr66_vgpr67_vgpr68 = DS_READ_B128_gfx9 $vgpr36, 768, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr69_vgpr70_vgpr71_vgpr72 = DS_READ2_B64_gfx9 $vgpr36, 98, 99, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr73_vgpr74_vgpr75_vgpr76 = DS_READ_B128_gfx9 $vgpr36, 1536, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr77_vgpr78_vgpr79_vgpr80 = DS_READ2_B64_gfx9 $vgpr36, 194, 195, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr81_vgpr82_vgpr83_vgpr84 = DS_READ_B128_gfx9 $vgpr36, 2304, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr85_vgpr86_vgpr87_vgpr88 = DS_READ2_B64_gfx9 $vgpr38, 0, 1, 0, implicit $exec :: (load (s128), addrspace 3)
+ }
+ $sgpr1 = S_ADD_I32 $sgpr0, 16, implicit-def dead $scc
+ $vgpr39 = V_ADD_U32_e32 32, killed $vgpr39, implicit $exec
+ $vgpr40 = V_ADD_U32_e32 32, killed $vgpr40, implicit $exec
+ S_CMP_LT_U32 killed $sgpr0, $sgpr12, implicit-def $scc
+ early-clobber $vgpr29_vgpr30_vgpr31_vgpr32 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr29_vgpr30_vgpr31_vgpr32, 0, 0, implicit $exec
+ early-clobber $vgpr25_vgpr26_vgpr27_vgpr28 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr25_vgpr26_vgpr27_vgpr28, 0, 0, implicit $exec
+ early-clobber $vgpr21_vgpr22_vgpr23_vgpr24 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr21_vgpr22_vgpr23_vgpr24, 0, 0, implicit $exec
+ early-clobber $vgpr17_vgpr18_vgpr19_vgpr20 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr17_vgpr18_vgpr19_vgpr20, 0, 0, implicit $exec
+ early-clobber $vgpr13_vgpr14_vgpr15_vgpr16 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr13_vgpr14_vgpr15_vgpr16, 0, 0, implicit $exec
+ early-clobber $vgpr9_vgpr10_vgpr11_vgpr12 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr9_vgpr10_vgpr11_vgpr12, 0, 0, implicit $exec
+ early-clobber $vgpr5_vgpr6_vgpr7_vgpr8 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88, 8, killed $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr5_vgpr6_vgpr7_vgpr8, 0, 0, implicit $exec
+ early-clobber $vgpr1_vgpr2_vgpr3_vgpr4 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88, 8, killed $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr1_vgpr2_vgpr3_vgpr4, 0, 0, implicit $exec
+ $sgpr0 = S_MOV_B32 killed $sgpr1
+ ATOMIC_FENCE 5, 2
+ S_BARRIER
+ ATOMIC_FENCE 4, 2
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
index f3cb5a7..30f5277 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
@@ -26,17 +26,17 @@ define amdgpu_kernel void @barrier_vmcnt_global(ptr addrspace(1) %arg) {
; GFX9-LABEL: barrier_vmcnt_global:
; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v1, s[0:1]
-; GFX9-NEXT: v_add_u32_e32 v1, 1, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: global_load_dword v3, v1, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_barrier
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: global_store_dword v[0:1], v3, off
; GFX9-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -369,10 +369,9 @@ define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(ptr %arg) {
; GFX8-NEXT: flat_load_dword v3, v[2:3]
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0
; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
-; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_barrier
; GFX8-NEXT: flat_store_dword v[0:1], v3
; GFX8-NEXT: s_endpgm
@@ -393,10 +392,9 @@ define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(ptr %arg) {
; GFX9-NEXT: flat_load_dword v3, v[2:3]
; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_barrier
; GFX9-NEXT: flat_store_dword v[0:1], v3
; GFX9-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/BPF/BTF/variant-part.ll b/llvm/test/CodeGen/BPF/BTF/variant-part.ll
new file mode 100644
index 0000000..1071e61
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/BTF/variant-part.ll
@@ -0,0 +1,87 @@
+; RUN: llc -mtriple=bpfel -filetype=obj -o %t1 %s
+; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s
+; RUN: llc -mtriple=bpfeb -filetype=obj -o %t1 %s
+; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s
+;
+; Source:
+; #![no_std]
+; #![no_main]
+;
+; pub enum MyEnum {
+; First { a: u32, b: i32 },
+; Second(u32),
+; }
+;
+; #[unsafe(no_mangle)]
+; pub static X: MyEnum = MyEnum::First { a: 54, b: -23 };
+;
+; #[cfg(not(test))]
+; #[panic_handler]
+; fn panic(_info: &core::panic::PanicInfo) -> ! {
+; loop {}
+; }
+; Compilation flag:
+; cargo +nightly rustc -Zbuild-std=core --target=bpfel-unknown-none -- --emit=llvm-bc
+; llvm-extract --glob=X $(find target/ -name "*.bc" | head -n 1) -o variant-part.bc
+; llvm-dis variant-part.bc -o variant-part.ll
+
+; ModuleID = 'variant-part.bc'
+source_filename = "c0znihgkvro8hs0n88fgrtg6x"
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "bpfel"
+
+@X = constant [12 x i8] c"\00\00\00\006\00\00\00\E9\FF\FF\FF", align 4, !dbg !0
+
+!llvm.module.flags = !{!22, !23, !24, !25}
+!llvm.ident = !{!26}
+!llvm.dbg.cu = !{!27}
+
+; CHECK-BTF: [1] STRUCT 'MyEnum' size=12 vlen=1
+; CHECK-BTF-NEXT: '(anon)' type_id=3 bits_offset=0
+; CHECK-BTF-NEXT: [2] INT 'u32' size=4 bits_offset=0 nr_bits=32 encoding=(none)
+; CHECK-BTF-NEXT: [3] UNION '(anon)' size=12 vlen=3
+; CHECK-BTF-NEXT: '(anon)' type_id=2 bits_offset=0
+; CHECK-BTF-NEXT: 'First' type_id=4 bits_offset=0
+; CHECK-BTF-NEXT: 'Second' type_id=6 bits_offset=0
+; CHECK-BTF-NEXT: [4] STRUCT 'First' size=12 vlen=2
+; CHECK-BTF-NEXT: 'a' type_id=2 bits_offset=32
+; CHECK-BTF-NEXT: 'b' type_id=5 bits_offset=64
+; CHECK-BTF-NEXT: [5] INT 'i32' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED
+; CHECK-BTF-NEXT: [6] STRUCT 'Second' size=12 vlen=1
+; CHECK-BTF-NEXT: '__0' type_id=2 bits_offset=32
+; CHECK-BTF-NEXT: [7] VAR 'X' type_id=1, linkage=global
+; CHECK-BTF-NEXT: [8] DATASEC '.rodata' size=0 vlen=1
+; CHECK-BTF-NEXT: type_id=7 offset=0 size=12
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "X", scope: !2, file: !3, line: 10, type: !4, isLocal: false, isDefinition: true, align: 32)
+!2 = !DINamespace(name: "variant_part", scope: null)
+!3 = !DIFile(filename: "variant-part/src/main.rs", directory: "/tmp/variant-part", checksumkind: CSK_MD5, checksum: "b94cd53886ea8f14cbc116b36bc7dd36")
+!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "MyEnum", scope: !2, file: !5, size: 96, align: 32, flags: DIFlagPublic, elements: !6, templateParams: !16, identifier: "faba668fd9f71e9b7cf3b9ac5e8b93cb")
+!5 = !DIFile(filename: "<unknown>", directory: "")
+!6 = !{!7}
+!7 = !DICompositeType(tag: DW_TAG_variant_part, scope: !4, file: !5, size: 96, align: 32, elements: !8, templateParams: !16, identifier: "e4aee046fc86d111657622fdcb8c42f7", discriminator: !21)
+!8 = !{!9, !17}
+!9 = !DIDerivedType(tag: DW_TAG_member, name: "First", scope: !7, file: !5, baseType: !10, size: 96, align: 32, extraData: i32 0)
+!10 = !DICompositeType(tag: DW_TAG_structure_type, name: "First", scope: !4, file: !5, size: 96, align: 32, flags: DIFlagPublic, elements: !11, templateParams: !16, identifier: "cc7748c842e275452db4205b190c8ff7")
+!11 = !{!12, !14}
+!12 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !10, file: !5, baseType: !13, size: 32, align: 32, offset: 32, flags: DIFlagPublic)
+!13 = !DIBasicType(name: "u32", size: 32, encoding: DW_ATE_unsigned)
+!14 = !DIDerivedType(tag: DW_TAG_member, name: "b", scope: !10, file: !5, baseType: !15, size: 32, align: 32, offset: 64, flags: DIFlagPublic)
+!15 = !DIBasicType(name: "i32", size: 32, encoding: DW_ATE_signed)
+!16 = !{}
+!17 = !DIDerivedType(tag: DW_TAG_member, name: "Second", scope: !7, file: !5, baseType: !18, size: 96, align: 32, extraData: i32 1)
+!18 = !DICompositeType(tag: DW_TAG_structure_type, name: "Second", scope: !4, file: !5, size: 96, align: 32, flags: DIFlagPublic, elements: !19, templateParams: !16, identifier: "a2094b1381f3082d504fbd0903aa7c06")
+!19 = !{!20}
+!20 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !18, file: !5, baseType: !13, size: 32, align: 32, offset: 32, flags: DIFlagPublic)
+!21 = !DIDerivedType(tag: DW_TAG_member, scope: !4, file: !5, baseType: !13, size: 32, align: 32, flags: DIFlagArtificial)
+!22 = !{i32 8, !"PIC Level", i32 2}
+!23 = !{i32 7, !"PIE Level", i32 2}
+!24 = !{i32 7, !"Dwarf Version", i32 4}
+!25 = !{i32 2, !"Debug Info Version", i32 3}
+!26 = !{!"rustc version 1.91.0-nightly (160e7623e 2025-08-26)"}
+!27 = distinct !DICompileUnit(language: DW_LANG_Rust, file: !28, producer: "clang LLVM (rustc version 1.91.0-nightly (160e7623e 2025-08-26))", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !29, splitDebugInlining: false, nameTableKind: None)
+!28 = !DIFile(filename: "variant-part/src/main.rs/@/c0znihgkvro8hs0n88fgrtg6x", directory: "/tmp/variant-part")
+!29 = !{!0}
diff --git a/llvm/test/CodeGen/Hexagon/insert-big.ll b/llvm/test/CodeGen/Hexagon/insert-big.ll
new file mode 100644
index 0000000..8735a66
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/insert-big.ll
@@ -0,0 +1,47 @@
+; Check that llc does not abort, which happened due to incorrect MIR.
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=1 < %s
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=2 < %s
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=3 < %s
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=4 < %s
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=5 < %s
+
+; Look for this symptom, in case llc does not check invalid IR.
+; CHECK-NOT: insert(%14,%5,#5,#5)
+
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=1 -debug-only=hexinsert -stop-after hexinsert < %s 2>&1 | FileCheck %s
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=2 -debug-only=hexinsert -stop-after hexinsert < %s 2>&1 | FileCheck %s
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=3 -debug-only=hexinsert -stop-after hexinsert < %s 2>&1 | FileCheck %s
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=4 -debug-only=hexinsert -stop-after hexinsert < %s 2>&1 | FileCheck %s
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=5 -debug-only=hexinsert -stop-after hexinsert < %s 2>&1 | FileCheck %s
+
+; REQUIRES: asserts
+
+define i32 @f(i32 %0, i32 %1, i32 %2) {
+entry:
+ switch i32 %0, label %common.ret1 [
+ i32 8907, label %3
+ i32 4115, label %6
+ ]
+
+common.ret1:
+ %common.ret1.op = phi i32 [ %5, %3 ], [ %526, %6 ], [ 0, %entry ]
+ ret i32 %common.ret1.op
+
+3:
+ %4 = shl i32 %2, 5
+ %5 = and i32 %4, 992
+ br label %common.ret1
+
+6:
+ %7 = shl i32 %0, 10
+ %8 = and i32 %7, 7168
+ %9 = shl i32 %0, 5
+ %10 = and i32 %9, 992
+ %11 = or i32 %10, %8
+ %12 = and i32 %0, 1
+ %13 = or i32 %11, %12
+ %14 = shl i32 %1, 1
+ %15 = and i32 %14, 2031616
+ %526 = or i32 %13, %15
+ br label %common.ret1
+}
diff --git a/llvm/test/CodeGen/Hexagon/qfp-conv.ll b/llvm/test/CodeGen/Hexagon/qfp-conv.ll
new file mode 100644
index 0000000..d2d393e
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/qfp-conv.ll
@@ -0,0 +1,35 @@
+; RUN: llc -mtriple=hexagon -mattr=+hvxv68,+hvx,+hvx-length128b < %s | FileCheck %s
+
+; Test that the Qfloat optimization pass doesn't crash due to an invalid
+; instructions.
+
+; CHECK: v{{[0-9]+}}.hf = v{{[0-9]:[0-9]}}.qf32
+
+define void @test(
+ <32 x i32>* %optr,
+ <64 x i32> %in64,
+ <32 x i32> %va,
+ <32 x i32> %vb
+) local_unnamed_addr #0 {
+entry:
+ br label %for.body
+
+for.body:
+ %optr.068 = phi <32 x i32>* [ %optr, %entry ], [ %incdec.ptr6, %for.body ]
+ %0 = tail call <32 x i32> @llvm.hexagon.V6.vconv.hf.qf32.128B(<64 x i32> %in64) #2
+ %1 = tail call <32 x i32> @llvm.hexagon.V6.vdealh.128B(<32 x i32> %0) #2
+ %2 = tail call <128 x i1> @llvm.hexagon.V6.vgth.128B(<32 x i32> %va, <32 x i32> %1) #2
+ %3 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<128 x i1> %2, <32 x i32> %va, <32 x i32> %vb) #2
+ %4 = tail call <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32> %3, <32 x i32> %vb) #2
+ %5 = tail call <32 x i32> @llvm.hexagon.V6.vpackhub.sat.128B(<32 x i32> %va, <32 x i32> %4) #2
+ store <32 x i32> %5, <32 x i32>* %optr.068, align 1
+ %incdec.ptr6 = getelementptr inbounds <32 x i32>, <32 x i32>* %optr.068, i32 1
+ br label %for.body
+}
+
+declare <32 x i32> @llvm.hexagon.V6.vdealh.128B(<32 x i32>) #1
+declare <32 x i32> @llvm.hexagon.V6.vconv.hf.qf32.128B(<64 x i32>) #1
+declare <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32>, <32 x i32>) #1
+declare <32 x i32> @llvm.hexagon.V6.vpackhub.sat.128B(<32 x i32>, <32 x i32>) #1
+declare <128 x i1> @llvm.hexagon.V6.vgth.128B(<32 x i32>, <32 x i32>) #1
+declare <32 x i32> @llvm.hexagon.V6.vmux.128B(<128 x i1>, <32 x i32>, <32 x i32>) #1
diff --git a/llvm/test/CodeGen/Hexagon/qfp-enabled.ll b/llvm/test/CodeGen/Hexagon/qfp-enabled.ll
new file mode 100644
index 0000000..a5cc5fa
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/qfp-enabled.ll
@@ -0,0 +1,19 @@
+; Tests if the flag to disable qfp optimizer pass works or not.
+
+; RUN: llc -march=hexagon -mcpu=hexagonv69 -mattr=+hvxv69,+hvx-length128b \
+; RUN: < %s -o -| FileCheck %s --check-prefix=ENABLED
+; RUN: llc -march=hexagon -mcpu=hexagonv69 -mattr=+hvxv69,+hvx-length128b \
+; RUN: -disable-qfp-opt < %s -o -| FileCheck %s --check-prefix=DISABLED
+
+define dso_local <32 x i32> @conv1_qf32(<32 x i32> noundef %input1, <32 x i32> noundef %input2) local_unnamed_addr {
+entry:
+; DISABLED: [[V2:v[0-9]+]].qf32 = vadd(v0.sf,v1.sf)
+; DISABLED: [[V3:v[0-9]+]].sf = [[V2]].qf32
+; DISABLED: qf32 = vadd(v0.sf,[[V3]].sf)
+; ENABLED: [[V4:v[0-9]+]].qf32 = vadd(v0.sf,v1.sf)
+; ENABLED: qf32 = vadd([[V4]].qf32,v0.sf)
+ %0 = tail call <32 x i32> @llvm.hexagon.V6.vadd.sf.128B(<32 x i32> %input1, <32 x i32> %input2)
+ %1 = tail call <32 x i32> @llvm.hexagon.V6.vconv.sf.qf32.128B(<32 x i32> %0)
+ %2 = tail call <32 x i32> @llvm.hexagon.V6.vadd.sf.128B(<32 x i32> %input1, <32 x i32> %1)
+ ret <32 x i32> %2
+}
diff --git a/llvm/test/CodeGen/Hexagon/qfp-remove-kill.mir b/llvm/test/CodeGen/Hexagon/qfp-remove-kill.mir
new file mode 100644
index 0000000..d8dde7d
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/qfp-remove-kill.mir
@@ -0,0 +1,95 @@
+# RUN: llc -march=hexagon -mcpu=hexagonv68 -mattr=+hvxv68,+hvx-length128b \
+# RUN: -run-pass hexagon-qfp-optimizer -run-pass machineverifier %s -o - | FileCheck %s
+
+# Test that the killed RegState from DefMI operands are removed
+# killed RegState should be set for MI operands
+# CHECK-LABEL: name: qfpAdd
+# CHECK: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG1:([0-9]+)]]
+# CHECK-NEXT: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG2:([0-9]+)]]
+# CHECK-NEXT: V6_vadd_qf32 killed %[[REG1]], killed %[[REG2]]
+# CHECK-NEXT: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG3:([0-9]+)]]
+# CHECK-NEXT: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG4:([0-9]+)]]
+# CHECK-NEXT: V6_vadd_qf32 killed %[[REG3]], killed %[[REG4]]
+
+---
+name: qfpAdd
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $r0, $r1, $r2, $r3
+ %0:intregs = COPY $r0
+ %1:intregs = COPY $r1
+ %2:intregs = COPY $r2
+ %3:intregs = COPY $r3
+ %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+ %5:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+ %6:hvxvr = V6_vL32Ub_ai %2:intregs, 0
+ %7:hvxvr = V6_vL32Ub_ai %3:intregs, 0
+ %8:hvxvr = V6_vconv_sf_qf32 killed %4:hvxvr
+ %9:hvxvr = V6_vconv_sf_qf32 killed %5:hvxvr
+ %10:hvxvr = V6_vadd_sf %8:hvxvr, %9:hvxvr
+ %11:hvxvr = V6_vconv_sf_qf32 killed %6:hvxvr
+ %12:hvxvr = V6_vconv_sf_qf32 killed %7:hvxvr
+ %13:hvxvr = V6_vadd_sf killed %11:hvxvr, killed %12:hvxvr
+...
+
+
+# Test that the killed RegState from DefMI operands are removed
+# CHECK-LABEL: name: qfpAddMix
+# CHECK: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG1:([0-9]+)]]
+# CHECK-NEXT: V6_vadd_qf32_mix killed %[[REG1]], %{{[0-9]+}}
+# CHECK: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG2:([0-9]+)]]
+# CHECK-NEXT: V6_vadd_qf32_mix killed %[[REG2]], %{{[0-9]+}}
+
+---
+name: qfpAddMix
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $r0, $r1, $r2
+ %0:intregs = COPY $r0
+ %1:intregs = COPY $r1
+ %2:intregs = COPY $r2
+ %3:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+ %4:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+ %5:hvxvr = V6_vL32Ub_ai %2:intregs, 0
+ %6:hvxvr = V6_vmpy_qf32_sf %4, %5
+ %7:hvxvr = V6_vconv_sf_qf32 killed %6:hvxvr
+ %8:hvxvr = V6_vadd_sf %3:hvxvr, %7:hvxvr
+ %9:hvxvr = V6_vmpy_qf32_sf %4, %5
+ %10:hvxvr = V6_vconv_sf_qf32 killed %9:hvxvr
+ %11:hvxvr = V6_vadd_sf %3:hvxvr, killed %10:hvxvr
+...
+
+
+# Test that we do generate V6_vsub_qf32_mix for the below test.
+# V6_vsub_qf32_mix only allowes qf32 as first operand. In the test qf32
+# is passed as first operand. So, V6_vsub_qf32_mix must be generated.
+# CHECK-LABEL: name: qfpAddSwapMix
+# CHECK: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG1:([0-9]+)]]
+# CHECK-NEXT: V6_vadd_qf32_mix killed %[[REG1]], %{{[0-9]+}}
+# CHECK: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG2:([0-9]+)]]
+# CHECK-NEXT: V6_vadd_qf32_mix killed %[[REG2]], %{{[0-9]+}}
+
+---
+name: qfpAddSwapMix
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $r0, $r1, $r2
+ %0:intregs = COPY $r0
+ %1:intregs = COPY $r1
+ %2:intregs = COPY $r2
+ %3:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+ %4:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+ %5:hvxvr = V6_vL32Ub_ai %2:intregs, 0
+ %6:hvxvr = V6_vmpy_qf32_sf %4, %5
+ %7:hvxvr = V6_vconv_sf_qf32 killed %6:hvxvr
+ %8:hvxvr = V6_vadd_sf %7:hvxvr, %3:hvxvr
+ %9:hvxvr = V6_vmpy_qf32_sf %4, %5
+ %10:hvxvr = V6_vconv_sf_qf32 killed %9:hvxvr
+ %11:hvxvr = V6_vadd_sf killed %10:hvxvr, %3:hvxvr
+...
diff --git a/llvm/test/CodeGen/Hexagon/qfp-subreg-bug.mir b/llvm/test/CodeGen/Hexagon/qfp-subreg-bug.mir
new file mode 100644
index 0000000..1d78203
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/qfp-subreg-bug.mir
@@ -0,0 +1,33 @@
+# RUN: llc -march=hexagon -mcpu=hexagonv69 -mattr=+hvxv69,+hvx-length128b -run-pass hexagon-qfp-optimizer %s -o - | FileCheck %s
+
+# CHECK: V6_vshuffvdd
+# CHECK: V6_vadd_sf
+# CHECK: V6_vadd_qf32_mix{{.*}}vsub_lo
+# CHECK: V6_vadd_qf32_mix{{.*}}vsub_hi
+
+---
+name: qfp_subreg_fix
+alignment: 16
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ %10:intregs = IMPLICIT_DEF
+ %9:hvxvr = V6_vL32Ub_ai %10, 0 :: (load (s1024) from `ptr undef`, align 4)
+ %11:intregs = A2_tfrsi 15360
+ %12:hvxvr = V6_lvsplath %11
+ %13:hvxwr = V6_vmpy_qf32_hf %9, %12
+ %15:hvxvr = V6_vconv_sf_qf32 %13.vsub_lo
+ %17:hvxvr = V6_vconv_sf_qf32 %13.vsub_hi
+ %18:intregslow8 = A2_tfrsi -4
+ %19:hvxwr = V6_vshuffvdd %17, %15, %18
+ %21:hvxvr = V6_vadd_sf %19.vsub_hi, %19.vsub_hi
+ %22:hvxvr = V6_vconv_sf_qf32 %21
+ %24:hvxvr = V6_vadd_sf %19.vsub_lo, %19.vsub_lo
+ %25:hvxvr = V6_vconv_sf_qf32 %24
+ %26:hvxvr = V6_vadd_sf %25, %19.vsub_lo
+ %27:hvxvr = V6_vconv_sf_qf32 %26
+ %28:hvxvr = V6_vadd_sf %22, %19.vsub_hi
+ %29:hvxvr = V6_vconv_sf_qf32 %28
+
+...
diff --git a/llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll b/llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll
new file mode 100644
index 0000000..c16370c
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll
@@ -0,0 +1,21 @@
+; Tests if generated vadd instruction takes in qf32
+; type as first parameter instead of a sf type without
+; any conversion instruction of type sf = qf32
+
+; RUN: llc -mtriple=hexagon < %s -o - | FileCheck %s
+
+; CHECK: [[V2:v[0-9]+]] = vxor([[V2]],[[V2]])
+; CHECK: [[V0:v[0-9]+]].qf32 = vmpy([[V0]].sf,[[V2]].sf)
+; CHECK: [[V1:v[0-9]+]].qf32 = vmpy([[V1]].sf,[[V2]].sf)
+; CHECK: [[V4:v[0-9]+]].qf32 = vadd([[V0]].qf32,[[V2]].sf)
+; CHECK: [[V5:v[0-9]+]].qf32 = vadd([[V1]].qf32,[[V2]].sf)
+
+define void @_Z19compute_ripple_geluIDF16_EviPT_PKS0_(ptr %out_ptr, <64 x float> %conv14.ripple.vectorized) #0 {
+entry:
+ %mul16.ripple.vectorized = fmul <64 x float> %conv14.ripple.vectorized, zeroinitializer
+ %conv17.ripple.vectorized = fptrunc <64 x float> %mul16.ripple.vectorized to <64 x half>
+ store <64 x half> %conv17.ripple.vectorized, ptr %out_ptr, align 2
+ ret void
+}
+
+attributes #0 = { "target-features"="+hvx-length128b,+hvxv75,+v75,-long-calls,-small-data" }
diff --git a/llvm/test/CodeGen/Hexagon/vect/qfp-mix.mir b/llvm/test/CodeGen/Hexagon/vect/qfp-mix.mir
new file mode 100644
index 0000000..9a9e938
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/vect/qfp-mix.mir
@@ -0,0 +1,79 @@
+# RUN: llc -march=hexagon -mcpu=hexagonv68 -mattr=+hvxv68,+hvx-length128b \
+# RUN: -run-pass hexagon-qfp-optimizer %s -o - | FileCheck %s
+
+
+# Test that the operands are swapped for Add if the second operand
+# is a qf32 to sf conversion. V6_vadd_qf32_mix supports first operand
+# as qf32.
+# CHECK-LABEL: name: qfpAddMix
+# CHECK: %[[REG:([0-9]+)]]:hvxvr = V6_vmpy_qf32_sf
+# CHECK: V6_vadd_qf32_mix %[[REG]]
+
+---
+name: qfpAddMix
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $r0, $r1, $r2, $r3
+ %0:intregs = COPY $r0
+ %1:intregs = COPY $r1
+ %2:intregs = COPY $r2
+ %3:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+ %4:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+ %5:hvxvr = V6_vL32Ub_ai %2:intregs, 0
+ %6:hvxvr = V6_vmpy_qf32_sf %4, %5
+ %7:hvxvr = V6_vconv_sf_qf32 %6:hvxvr
+ %8:hvxvr = V6_vadd_sf %3:hvxvr, %7:hvxvr
+...
+
+
+# Test that we do not generate V6_vsub_qf32_mix for the below test.
+# V6_vsub_qf32_mix only allowes qf32 as first operand. In the test qf32
+# is passed as second operand. As sub is not commutative, we should not
+# generate the mix instruction.
+# CHECK-LABEL: name: qfpSubNoMix
+# CHECK-NOT: V6_vsub_qf32_mix
+
+---
+name: qfpSubNoMix
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $r0, $r1, $r2, $r3
+ %0:intregs = COPY $r0
+ %1:intregs = COPY $r1
+ %2:intregs = COPY $r2
+ %3:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+ %4:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+ %5:hvxvr = V6_vL32Ub_ai %2:intregs, 0
+ %6:hvxvr = V6_vmpy_qf32_sf %4, %5
+ %7:hvxvr = V6_vconv_sf_qf32 %6:hvxvr
+ %8:hvxvr = V6_vsub_sf %3:hvxvr, %7:hvxvr
+...
+
+
+# Test that we do generate V6_vsub_qf32_mix for the below test.
+# V6_vsub_qf32_mix only allowes qf32 as first operand. In the test qf32
+# is passed as first operand. So, V6_vsub_qf32_mix must be generated.
+# CHECK-LABEL: name: qfpSubMix
+# CHECK: V6_vsub_qf32_mix
+
+---
+name: qfpSubMix
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $r0, $r1, $r2, $r3
+ %0:intregs = COPY $r0
+ %1:intregs = COPY $r1
+ %2:intregs = COPY $r2
+ %3:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+ %4:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+ %5:hvxvr = V6_vL32Ub_ai %2:intregs, 0
+ %6:hvxvr = V6_vmpy_qf32_sf %4, %5
+ %7:hvxvr = V6_vconv_sf_qf32 %6:hvxvr
+ %8:hvxvr = V6_vsub_sf %7:hvxvr, %3:hvxvr
+...
diff --git a/llvm/test/CodeGen/Hexagon/vect/qfp-zeroinit.mir b/llvm/test/CodeGen/Hexagon/vect/qfp-zeroinit.mir
new file mode 100644
index 0000000..f0b1d3c
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/vect/qfp-zeroinit.mir
@@ -0,0 +1,23 @@
+# RUN: llc -march=hexagon -mcpu=hexagonv68 -mattr=+hvxv68,+hvx-length128b -run-pass hexagon-qfp-optimizer %s -o - | FileCheck %s
+
+# CHECK-LABEL: name: qfpAdd32
+# CHECK: V6_vd0
+# CHECK-NEXT: V6_vL32Ub_ai
+# CHECK-NEXT: V6_vadd_sf
+# CHECK-NEXT: V6_vconv_sf_qf32
+# CHECK-NEXT: V6_vS32Ub_ai
+---
+name: qfpAdd32
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $r0, $r1
+ %0:intregs = COPY $r0
+ %1:intregs = COPY $r1
+ %3:hvxvr = V6_vd0
+ %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+ %5:hvxvr = V6_vadd_sf %3:hvxvr, %4:hvxvr
+ %6:hvxvr = V6_vconv_sf_qf32 %5:hvxvr
+ V6_vS32Ub_ai %1:intregs, 0, %6:hvxvr
+...
diff --git a/llvm/test/CodeGen/Hexagon/vect/unique-vreg-def.ll b/llvm/test/CodeGen/Hexagon/vect/unique-vreg-def.ll
new file mode 100644
index 0000000..2d46da7
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/vect/unique-vreg-def.ll
@@ -0,0 +1,32 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; REQUIRES: hexagon
+
+; This test was asserting because getVRegDef() was called on a register with
+; multiple defs.
+; Checks that the test does not assert and vsub is generated.
+; CHECK: vsub
+
+target triple = "hexagon"
+
+@v = common dso_local local_unnamed_addr global <32 x i32> zeroinitializer, align 128
+
+; Function Attrs: nounwind
+define dso_local void @hvx_twoSum(<32 x i32>* nocapture noundef writeonly %s2lo) local_unnamed_addr #0 {
+entry:
+ %0 = load <32 x i32>, <32 x i32>* @v, align 128
+ %call = tail call inreg <32 x i32> @MY_Vsf_equals_Vqf32(<32 x i32> noundef %0) #3
+ %1 = tail call <32 x i32> @llvm.hexagon.V6.vsub.sf.128B(<32 x i32> %call, <32 x i32> %call)
+ store <32 x i32> %1, <32 x i32>* @v, align 128
+ store <32 x i32> %1, <32 x i32>* %s2lo, align 128
+ ret void
+}
+
+declare dso_local inreg <32 x i32> @MY_Vsf_equals_Vqf32(<32 x i32> noundef) local_unnamed_addr #1
+
+; Function Attrs: nofree nosync nounwind readnone
+declare <32 x i32> @llvm.hexagon.V6.vsub.sf.128B(<32 x i32>, <32 x i32>) #2
+
+attributes #0 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv73" "target-features"="+hvx-length128b,+hvxv73,+v73,-long-calls" }
+attributes #1 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv73" "target-features"="+hvx-length128b,+hvxv73,+v73,-long-calls" }
+attributes #2 = { nofree nosync nounwind readnone }
+attributes #3 = { nounwind }
diff --git a/llvm/test/CodeGen/PowerPC/all-atomics.ll b/llvm/test/CodeGen/PowerPC/all-atomics.ll
index 7e892fc..93968b71 100644
--- a/llvm/test/CodeGen/PowerPC/all-atomics.ll
+++ b/llvm/test/CodeGen/PowerPC/all-atomics.ll
@@ -33,7 +33,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 5, 0, 4
; CHECK-NEXT: addi 5, 5, 1
; CHECK-NEXT: stbcx. 5, 0, 4
-; CHECK-NEXT: bne 0, .LBB0_1
+; CHECK-NEXT: bne- 0, .LBB0_1
; CHECK-NEXT: # %bb.2: # %entry
; CHECK-NEXT: addis 5, 2, uc@toc@ha
; CHECK-NEXT: lwsync
@@ -44,7 +44,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 6, 0, 5
; CHECK-NEXT: addi 6, 6, 1
; CHECK-NEXT: stbcx. 6, 0, 5
-; CHECK-NEXT: bne 0, .LBB0_3
+; CHECK-NEXT: bne- 0, .LBB0_3
; CHECK-NEXT: # %bb.4: # %entry
; CHECK-NEXT: addis 6, 2, ss@toc@ha
; CHECK-NEXT: lwsync
@@ -55,7 +55,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 7, 0, 6
; CHECK-NEXT: addi 7, 7, 1
; CHECK-NEXT: sthcx. 7, 0, 6
-; CHECK-NEXT: bne 0, .LBB0_5
+; CHECK-NEXT: bne- 0, .LBB0_5
; CHECK-NEXT: # %bb.6: # %entry
; CHECK-NEXT: addis 7, 2, us@toc@ha
; CHECK-NEXT: lwsync
@@ -66,7 +66,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 7, 0, 8
; CHECK-NEXT: addi 7, 7, 1
; CHECK-NEXT: sthcx. 7, 0, 8
-; CHECK-NEXT: bne 0, .LBB0_7
+; CHECK-NEXT: bne- 0, .LBB0_7
; CHECK-NEXT: # %bb.8: # %entry
; CHECK-NEXT: addis 7, 2, si@toc@ha
; CHECK-NEXT: lwsync
@@ -77,7 +77,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 7, 0, 9
; CHECK-NEXT: addi 7, 7, 1
; CHECK-NEXT: stwcx. 7, 0, 9
-; CHECK-NEXT: bne 0, .LBB0_9
+; CHECK-NEXT: bne- 0, .LBB0_9
; CHECK-NEXT: # %bb.10: # %entry
; CHECK-NEXT: addis 7, 2, ui@toc@ha
; CHECK-NEXT: lwsync
@@ -88,7 +88,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 7, 0, 10
; CHECK-NEXT: addi 7, 7, 1
; CHECK-NEXT: stwcx. 7, 0, 10
-; CHECK-NEXT: bne 0, .LBB0_11
+; CHECK-NEXT: bne- 0, .LBB0_11
; CHECK-NEXT: # %bb.12: # %entry
; CHECK-NEXT: addis 7, 2, sll@toc@ha
; CHECK-NEXT: lwsync
@@ -100,7 +100,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 12, 0, 11
; CHECK-NEXT: addi 12, 12, 1
; CHECK-NEXT: stdcx. 12, 0, 11
-; CHECK-NEXT: bne 0, .LBB0_13
+; CHECK-NEXT: bne- 0, .LBB0_13
; CHECK-NEXT: # %bb.14: # %entry
; CHECK-NEXT: addis 12, 2, ull@toc@ha
; CHECK-NEXT: lwsync
@@ -111,7 +111,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 30, 0, 12
; CHECK-NEXT: addi 0, 30, 1
; CHECK-NEXT: stdcx. 0, 0, 12
-; CHECK-NEXT: bne 0, .LBB0_15
+; CHECK-NEXT: bne- 0, .LBB0_15
; CHECK-NEXT: # %bb.16: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -120,7 +120,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 0, 0, 4
; CHECK-NEXT: sub 0, 0, 3
; CHECK-NEXT: stbcx. 0, 0, 4
-; CHECK-NEXT: bne 0, .LBB0_17
+; CHECK-NEXT: bne- 0, .LBB0_17
; CHECK-NEXT: # %bb.18: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -129,7 +129,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 0, 0, 5
; CHECK-NEXT: sub 0, 0, 3
; CHECK-NEXT: stbcx. 0, 0, 5
-; CHECK-NEXT: bne 0, .LBB0_19
+; CHECK-NEXT: bne- 0, .LBB0_19
; CHECK-NEXT: # %bb.20: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -138,7 +138,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 0, 0, 6
; CHECK-NEXT: sub 0, 0, 3
; CHECK-NEXT: sthcx. 0, 0, 6
-; CHECK-NEXT: bne 0, .LBB0_21
+; CHECK-NEXT: bne- 0, .LBB0_21
; CHECK-NEXT: # %bb.22: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -147,7 +147,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 0, 0, 8
; CHECK-NEXT: sub 0, 0, 3
; CHECK-NEXT: sthcx. 0, 0, 8
-; CHECK-NEXT: bne 0, .LBB0_23
+; CHECK-NEXT: bne- 0, .LBB0_23
; CHECK-NEXT: # %bb.24: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -156,7 +156,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 0, 0, 9
; CHECK-NEXT: sub 0, 0, 3
; CHECK-NEXT: stwcx. 0, 0, 9
-; CHECK-NEXT: bne 0, .LBB0_25
+; CHECK-NEXT: bne- 0, .LBB0_25
; CHECK-NEXT: # %bb.26: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -165,7 +165,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 0, 0, 10
; CHECK-NEXT: sub 0, 0, 3
; CHECK-NEXT: stwcx. 0, 0, 10
-; CHECK-NEXT: bne 0, .LBB0_27
+; CHECK-NEXT: bne- 0, .LBB0_27
; CHECK-NEXT: # %bb.28: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -174,7 +174,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 0, 0, 11
; CHECK-NEXT: sub 0, 0, 7
; CHECK-NEXT: stdcx. 0, 0, 11
-; CHECK-NEXT: bne 0, .LBB0_29
+; CHECK-NEXT: bne- 0, .LBB0_29
; CHECK-NEXT: # %bb.30: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -183,7 +183,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 0, 0, 12
; CHECK-NEXT: sub 0, 0, 7
; CHECK-NEXT: stdcx. 0, 0, 12
-; CHECK-NEXT: bne 0, .LBB0_31
+; CHECK-NEXT: bne- 0, .LBB0_31
; CHECK-NEXT: # %bb.32: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -192,7 +192,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 0, 0, 4
; CHECK-NEXT: ori 0, 0, 1
; CHECK-NEXT: stbcx. 0, 0, 4
-; CHECK-NEXT: bne 0, .LBB0_33
+; CHECK-NEXT: bne- 0, .LBB0_33
; CHECK-NEXT: # %bb.34: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -201,7 +201,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 0, 0, 5
; CHECK-NEXT: ori 0, 0, 1
; CHECK-NEXT: stbcx. 0, 0, 5
-; CHECK-NEXT: bne 0, .LBB0_35
+; CHECK-NEXT: bne- 0, .LBB0_35
; CHECK-NEXT: # %bb.36: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -210,7 +210,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 0, 0, 6
; CHECK-NEXT: ori 0, 0, 1
; CHECK-NEXT: sthcx. 0, 0, 6
-; CHECK-NEXT: bne 0, .LBB0_37
+; CHECK-NEXT: bne- 0, .LBB0_37
; CHECK-NEXT: # %bb.38: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -219,7 +219,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 0, 0, 8
; CHECK-NEXT: ori 0, 0, 1
; CHECK-NEXT: sthcx. 0, 0, 8
-; CHECK-NEXT: bne 0, .LBB0_39
+; CHECK-NEXT: bne- 0, .LBB0_39
; CHECK-NEXT: # %bb.40: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -228,7 +228,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 0, 0, 9
; CHECK-NEXT: ori 0, 0, 1
; CHECK-NEXT: stwcx. 0, 0, 9
-; CHECK-NEXT: bne 0, .LBB0_41
+; CHECK-NEXT: bne- 0, .LBB0_41
; CHECK-NEXT: # %bb.42: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -237,7 +237,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 0, 0, 10
; CHECK-NEXT: ori 0, 0, 1
; CHECK-NEXT: stwcx. 0, 0, 10
-; CHECK-NEXT: bne 0, .LBB0_43
+; CHECK-NEXT: bne- 0, .LBB0_43
; CHECK-NEXT: # %bb.44: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -246,7 +246,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 0, 0, 11
; CHECK-NEXT: ori 0, 0, 1
; CHECK-NEXT: stdcx. 0, 0, 11
-; CHECK-NEXT: bne 0, .LBB0_45
+; CHECK-NEXT: bne- 0, .LBB0_45
; CHECK-NEXT: # %bb.46: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -255,7 +255,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 0, 0, 12
; CHECK-NEXT: ori 0, 0, 1
; CHECK-NEXT: stdcx. 0, 0, 12
-; CHECK-NEXT: bne 0, .LBB0_47
+; CHECK-NEXT: bne- 0, .LBB0_47
; CHECK-NEXT: # %bb.48: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -264,7 +264,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 0, 0, 4
; CHECK-NEXT: xori 0, 0, 1
; CHECK-NEXT: stbcx. 0, 0, 4
-; CHECK-NEXT: bne 0, .LBB0_49
+; CHECK-NEXT: bne- 0, .LBB0_49
; CHECK-NEXT: # %bb.50: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -273,7 +273,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 0, 0, 5
; CHECK-NEXT: xori 0, 0, 1
; CHECK-NEXT: stbcx. 0, 0, 5
-; CHECK-NEXT: bne 0, .LBB0_51
+; CHECK-NEXT: bne- 0, .LBB0_51
; CHECK-NEXT: # %bb.52: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -282,7 +282,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 0, 0, 6
; CHECK-NEXT: xori 0, 0, 1
; CHECK-NEXT: sthcx. 0, 0, 6
-; CHECK-NEXT: bne 0, .LBB0_53
+; CHECK-NEXT: bne- 0, .LBB0_53
; CHECK-NEXT: # %bb.54: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -291,7 +291,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 0, 0, 8
; CHECK-NEXT: xori 0, 0, 1
; CHECK-NEXT: sthcx. 0, 0, 8
-; CHECK-NEXT: bne 0, .LBB0_55
+; CHECK-NEXT: bne- 0, .LBB0_55
; CHECK-NEXT: # %bb.56: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -300,7 +300,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 0, 0, 9
; CHECK-NEXT: xori 0, 0, 1
; CHECK-NEXT: stwcx. 0, 0, 9
-; CHECK-NEXT: bne 0, .LBB0_57
+; CHECK-NEXT: bne- 0, .LBB0_57
; CHECK-NEXT: # %bb.58: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -309,7 +309,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 0, 0, 10
; CHECK-NEXT: xori 0, 0, 1
; CHECK-NEXT: stwcx. 0, 0, 10
-; CHECK-NEXT: bne 0, .LBB0_59
+; CHECK-NEXT: bne- 0, .LBB0_59
; CHECK-NEXT: # %bb.60: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -318,7 +318,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 0, 0, 11
; CHECK-NEXT: xori 0, 0, 1
; CHECK-NEXT: stdcx. 0, 0, 11
-; CHECK-NEXT: bne 0, .LBB0_61
+; CHECK-NEXT: bne- 0, .LBB0_61
; CHECK-NEXT: # %bb.62: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -327,7 +327,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 0, 0, 12
; CHECK-NEXT: xori 0, 0, 1
; CHECK-NEXT: stdcx. 0, 0, 12
-; CHECK-NEXT: bne 0, .LBB0_63
+; CHECK-NEXT: bne- 0, .LBB0_63
; CHECK-NEXT: # %bb.64: # %entry
; CHECK-NEXT: addis 30, 2, u128@toc@ha
; CHECK-NEXT: lwsync
@@ -361,7 +361,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 0, 0, 4
; CHECK-NEXT: nand 0, 3, 0
; CHECK-NEXT: stbcx. 0, 0, 4
-; CHECK-NEXT: bne 0, .LBB0_69
+; CHECK-NEXT: bne- 0, .LBB0_69
; CHECK-NEXT: # %bb.70: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -370,7 +370,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 0, 0, 5
; CHECK-NEXT: nand 0, 3, 0
; CHECK-NEXT: stbcx. 0, 0, 5
-; CHECK-NEXT: bne 0, .LBB0_71
+; CHECK-NEXT: bne- 0, .LBB0_71
; CHECK-NEXT: # %bb.72: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -379,7 +379,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 0, 0, 6
; CHECK-NEXT: nand 0, 3, 0
; CHECK-NEXT: sthcx. 0, 0, 6
-; CHECK-NEXT: bne 0, .LBB0_73
+; CHECK-NEXT: bne- 0, .LBB0_73
; CHECK-NEXT: # %bb.74: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -388,7 +388,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 0, 0, 8
; CHECK-NEXT: nand 0, 3, 0
; CHECK-NEXT: sthcx. 0, 0, 8
-; CHECK-NEXT: bne 0, .LBB0_75
+; CHECK-NEXT: bne- 0, .LBB0_75
; CHECK-NEXT: # %bb.76: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -397,7 +397,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 0, 0, 9
; CHECK-NEXT: nand 0, 3, 0
; CHECK-NEXT: stwcx. 0, 0, 9
-; CHECK-NEXT: bne 0, .LBB0_77
+; CHECK-NEXT: bne- 0, .LBB0_77
; CHECK-NEXT: # %bb.78: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -406,7 +406,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 0, 0, 10
; CHECK-NEXT: nand 0, 3, 0
; CHECK-NEXT: stwcx. 0, 0, 10
-; CHECK-NEXT: bne 0, .LBB0_79
+; CHECK-NEXT: bne- 0, .LBB0_79
; CHECK-NEXT: # %bb.80: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -415,7 +415,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 0, 0, 11
; CHECK-NEXT: nand 0, 7, 0
; CHECK-NEXT: stdcx. 0, 0, 11
-; CHECK-NEXT: bne 0, .LBB0_81
+; CHECK-NEXT: bne- 0, .LBB0_81
; CHECK-NEXT: # %bb.82: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -424,7 +424,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 0, 0, 12
; CHECK-NEXT: nand 0, 7, 0
; CHECK-NEXT: stdcx. 0, 0, 12
-; CHECK-NEXT: bne 0, .LBB0_83
+; CHECK-NEXT: bne- 0, .LBB0_83
; CHECK-NEXT: # %bb.84: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -433,7 +433,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 0, 0, 4
; CHECK-NEXT: and 0, 3, 0
; CHECK-NEXT: stbcx. 0, 0, 4
-; CHECK-NEXT: bne 0, .LBB0_85
+; CHECK-NEXT: bne- 0, .LBB0_85
; CHECK-NEXT: # %bb.86: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -442,7 +442,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 4, 0, 5
; CHECK-NEXT: and 4, 3, 4
; CHECK-NEXT: stbcx. 4, 0, 5
-; CHECK-NEXT: bne 0, .LBB0_87
+; CHECK-NEXT: bne- 0, .LBB0_87
; CHECK-NEXT: # %bb.88: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -451,7 +451,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 4, 0, 6
; CHECK-NEXT: and 4, 3, 4
; CHECK-NEXT: sthcx. 4, 0, 6
-; CHECK-NEXT: bne 0, .LBB0_89
+; CHECK-NEXT: bne- 0, .LBB0_89
; CHECK-NEXT: # %bb.90: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -460,7 +460,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 4, 0, 8
; CHECK-NEXT: and 4, 3, 4
; CHECK-NEXT: sthcx. 4, 0, 8
-; CHECK-NEXT: bne 0, .LBB0_91
+; CHECK-NEXT: bne- 0, .LBB0_91
; CHECK-NEXT: # %bb.92: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -469,7 +469,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 4, 0, 9
; CHECK-NEXT: and 4, 3, 4
; CHECK-NEXT: stwcx. 4, 0, 9
-; CHECK-NEXT: bne 0, .LBB0_93
+; CHECK-NEXT: bne- 0, .LBB0_93
; CHECK-NEXT: # %bb.94: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -478,7 +478,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 4, 0, 10
; CHECK-NEXT: and 4, 3, 4
; CHECK-NEXT: stwcx. 4, 0, 10
-; CHECK-NEXT: bne 0, .LBB0_95
+; CHECK-NEXT: bne- 0, .LBB0_95
; CHECK-NEXT: # %bb.96: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -487,7 +487,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 3, 0, 11
; CHECK-NEXT: and 3, 7, 3
; CHECK-NEXT: stdcx. 3, 0, 11
-; CHECK-NEXT: bne 0, .LBB0_97
+; CHECK-NEXT: bne- 0, .LBB0_97
; CHECK-NEXT: # %bb.98: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sync
@@ -496,7 +496,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 3, 0, 12
; CHECK-NEXT: and 3, 7, 3
; CHECK-NEXT: stdcx. 3, 0, 12
-; CHECK-NEXT: bne 0, .LBB0_99
+; CHECK-NEXT: bne- 0, .LBB0_99
; CHECK-NEXT: # %bb.100: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: ld 30, -16(1) # 8-byte Folded Reload
@@ -545,7 +545,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 4, 5, 4
; AIX32-NEXT: stwcx. 4, 0, 28
-; AIX32-NEXT: bne 0, L..BB0_1
+; AIX32-NEXT: bne- 0, L..BB0_1
; AIX32-NEXT: # %bb.2: # %entry
; AIX32-NEXT: lwz 3, L..C1(2) # @uc
; AIX32-NEXT: lwsync
@@ -564,7 +564,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 4, 5, 4
; AIX32-NEXT: stwcx. 4, 0, 27
-; AIX32-NEXT: bne 0, L..BB0_3
+; AIX32-NEXT: bne- 0, L..BB0_3
; AIX32-NEXT: # %bb.4: # %entry
; AIX32-NEXT: lwz 3, L..C2(2) # @ss
; AIX32-NEXT: lwsync
@@ -584,7 +584,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 4, 5, 4
; AIX32-NEXT: stwcx. 4, 0, 25
-; AIX32-NEXT: bne 0, L..BB0_5
+; AIX32-NEXT: bne- 0, L..BB0_5
; AIX32-NEXT: # %bb.6: # %entry
; AIX32-NEXT: lwz 3, L..C3(2) # @us
; AIX32-NEXT: lwsync
@@ -604,7 +604,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 4, 5, 4
; AIX32-NEXT: stwcx. 4, 0, 23
-; AIX32-NEXT: bne 0, L..BB0_7
+; AIX32-NEXT: bne- 0, L..BB0_7
; AIX32-NEXT: # %bb.8: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: lwz 20, L..C4(2) # @si
@@ -614,7 +614,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 3, 0, 20
; AIX32-NEXT: addi 3, 3, 1
; AIX32-NEXT: stwcx. 3, 0, 20
-; AIX32-NEXT: bne 0, L..BB0_9
+; AIX32-NEXT: bne- 0, L..BB0_9
; AIX32-NEXT: # %bb.10: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: lwz 19, L..C5(2) # @ui
@@ -624,7 +624,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 3, 0, 19
; AIX32-NEXT: addi 3, 3, 1
; AIX32-NEXT: stwcx. 3, 0, 19
-; AIX32-NEXT: bne 0, L..BB0_11
+; AIX32-NEXT: bne- 0, L..BB0_11
; AIX32-NEXT: # %bb.12: # %entry
; AIX32-NEXT: lwz 31, L..C6(2) # @sll
; AIX32-NEXT: lwsync
@@ -652,7 +652,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 4, 5, 4
; AIX32-NEXT: stwcx. 4, 0, 28
-; AIX32-NEXT: bne 0, L..BB0_13
+; AIX32-NEXT: bne- 0, L..BB0_13
; AIX32-NEXT: # %bb.14: # %entry
; AIX32-NEXT: li 3, 255
; AIX32-NEXT: lwsync
@@ -666,7 +666,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 4, 5, 4
; AIX32-NEXT: stwcx. 4, 0, 27
-; AIX32-NEXT: bne 0, L..BB0_15
+; AIX32-NEXT: bne- 0, L..BB0_15
; AIX32-NEXT: # %bb.16: # %entry
; AIX32-NEXT: li 3, 0
; AIX32-NEXT: lwsync
@@ -681,7 +681,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 4, 5, 4
; AIX32-NEXT: stwcx. 4, 0, 25
-; AIX32-NEXT: bne 0, L..BB0_17
+; AIX32-NEXT: bne- 0, L..BB0_17
; AIX32-NEXT: # %bb.18: # %entry
; AIX32-NEXT: li 3, 0
; AIX32-NEXT: lwsync
@@ -696,7 +696,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 4, 5, 4
; AIX32-NEXT: stwcx. 4, 0, 23
-; AIX32-NEXT: bne 0, L..BB0_19
+; AIX32-NEXT: bne- 0, L..BB0_19
; AIX32-NEXT: # %bb.20: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: sync
@@ -705,7 +705,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 3, 0, 20
; AIX32-NEXT: sub 3, 3, 15
; AIX32-NEXT: stwcx. 3, 0, 20
-; AIX32-NEXT: bne 0, L..BB0_21
+; AIX32-NEXT: bne- 0, L..BB0_21
; AIX32-NEXT: # %bb.22: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: sync
@@ -714,7 +714,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 3, 0, 19
; AIX32-NEXT: sub 3, 3, 15
; AIX32-NEXT: stwcx. 3, 0, 19
-; AIX32-NEXT: bne 0, L..BB0_23
+; AIX32-NEXT: bne- 0, L..BB0_23
; AIX32-NEXT: # %bb.24: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: li 4, 0
@@ -740,7 +740,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 4, 5, 4
; AIX32-NEXT: stwcx. 4, 0, 28
-; AIX32-NEXT: bne 0, L..BB0_25
+; AIX32-NEXT: bne- 0, L..BB0_25
; AIX32-NEXT: # %bb.26: # %entry
; AIX32-NEXT: li 3, 255
; AIX32-NEXT: lwsync
@@ -754,7 +754,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 4, 5, 4
; AIX32-NEXT: stwcx. 4, 0, 27
-; AIX32-NEXT: bne 0, L..BB0_27
+; AIX32-NEXT: bne- 0, L..BB0_27
; AIX32-NEXT: # %bb.28: # %entry
; AIX32-NEXT: li 3, 0
; AIX32-NEXT: lwsync
@@ -769,7 +769,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 4, 5, 4
; AIX32-NEXT: stwcx. 4, 0, 25
-; AIX32-NEXT: bne 0, L..BB0_29
+; AIX32-NEXT: bne- 0, L..BB0_29
; AIX32-NEXT: # %bb.30: # %entry
; AIX32-NEXT: li 3, 0
; AIX32-NEXT: lwsync
@@ -784,7 +784,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 4, 5, 4
; AIX32-NEXT: stwcx. 4, 0, 23
-; AIX32-NEXT: bne 0, L..BB0_31
+; AIX32-NEXT: bne- 0, L..BB0_31
; AIX32-NEXT: # %bb.32: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: sync
@@ -793,7 +793,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 3, 0, 20
; AIX32-NEXT: ori 3, 3, 1
; AIX32-NEXT: stwcx. 3, 0, 20
-; AIX32-NEXT: bne 0, L..BB0_33
+; AIX32-NEXT: bne- 0, L..BB0_33
; AIX32-NEXT: # %bb.34: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: sync
@@ -802,7 +802,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 3, 0, 19
; AIX32-NEXT: ori 3, 3, 1
; AIX32-NEXT: stwcx. 3, 0, 19
-; AIX32-NEXT: bne 0, L..BB0_35
+; AIX32-NEXT: bne- 0, L..BB0_35
; AIX32-NEXT: # %bb.36: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: li 4, 0
@@ -828,7 +828,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 4, 5, 4
; AIX32-NEXT: stwcx. 4, 0, 28
-; AIX32-NEXT: bne 0, L..BB0_37
+; AIX32-NEXT: bne- 0, L..BB0_37
; AIX32-NEXT: # %bb.38: # %entry
; AIX32-NEXT: li 3, 255
; AIX32-NEXT: lwsync
@@ -842,7 +842,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 4, 5, 4
; AIX32-NEXT: stwcx. 4, 0, 27
-; AIX32-NEXT: bne 0, L..BB0_39
+; AIX32-NEXT: bne- 0, L..BB0_39
; AIX32-NEXT: # %bb.40: # %entry
; AIX32-NEXT: li 3, 0
; AIX32-NEXT: lwsync
@@ -857,7 +857,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 4, 5, 4
; AIX32-NEXT: stwcx. 4, 0, 25
-; AIX32-NEXT: bne 0, L..BB0_41
+; AIX32-NEXT: bne- 0, L..BB0_41
; AIX32-NEXT: # %bb.42: # %entry
; AIX32-NEXT: li 3, 0
; AIX32-NEXT: lwsync
@@ -872,7 +872,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 4, 5, 4
; AIX32-NEXT: stwcx. 4, 0, 23
-; AIX32-NEXT: bne 0, L..BB0_43
+; AIX32-NEXT: bne- 0, L..BB0_43
; AIX32-NEXT: # %bb.44: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: sync
@@ -881,7 +881,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 3, 0, 20
; AIX32-NEXT: xori 3, 3, 1
; AIX32-NEXT: stwcx. 3, 0, 20
-; AIX32-NEXT: bne 0, L..BB0_45
+; AIX32-NEXT: bne- 0, L..BB0_45
; AIX32-NEXT: # %bb.46: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: sync
@@ -890,7 +890,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 3, 0, 19
; AIX32-NEXT: xori 3, 3, 1
; AIX32-NEXT: stwcx. 3, 0, 19
-; AIX32-NEXT: bne 0, L..BB0_47
+; AIX32-NEXT: bne- 0, L..BB0_47
; AIX32-NEXT: # %bb.48: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: li 4, 0
@@ -986,7 +986,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 4, 5, 4
; AIX32-NEXT: stwcx. 4, 0, 28
-; AIX32-NEXT: bne 0, L..BB0_53
+; AIX32-NEXT: bne- 0, L..BB0_53
; AIX32-NEXT: # %bb.54: # %atomicrmw.end
; AIX32-NEXT: li 3, 255
; AIX32-NEXT: lwsync
@@ -1001,7 +1001,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 4, 5, 4
; AIX32-NEXT: stwcx. 4, 0, 27
-; AIX32-NEXT: bne 0, L..BB0_55
+; AIX32-NEXT: bne- 0, L..BB0_55
; AIX32-NEXT: # %bb.56: # %atomicrmw.end
; AIX32-NEXT: li 3, 0
; AIX32-NEXT: lwsync
@@ -1017,7 +1017,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 4, 5, 4
; AIX32-NEXT: stwcx. 4, 0, 25
-; AIX32-NEXT: bne 0, L..BB0_57
+; AIX32-NEXT: bne- 0, L..BB0_57
; AIX32-NEXT: # %bb.58: # %atomicrmw.end
; AIX32-NEXT: li 3, 0
; AIX32-NEXT: lwsync
@@ -1033,7 +1033,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 4, 5, 4
; AIX32-NEXT: stwcx. 4, 0, 23
-; AIX32-NEXT: bne 0, L..BB0_59
+; AIX32-NEXT: bne- 0, L..BB0_59
; AIX32-NEXT: # %bb.60: # %atomicrmw.end
; AIX32-NEXT: lwsync
; AIX32-NEXT: sync
@@ -1042,7 +1042,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 3, 0, 20
; AIX32-NEXT: nand 3, 29, 3
; AIX32-NEXT: stwcx. 3, 0, 20
-; AIX32-NEXT: bne 0, L..BB0_61
+; AIX32-NEXT: bne- 0, L..BB0_61
; AIX32-NEXT: # %bb.62: # %atomicrmw.end
; AIX32-NEXT: lwsync
; AIX32-NEXT: sync
@@ -1051,7 +1051,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 3, 0, 19
; AIX32-NEXT: nand 3, 29, 3
; AIX32-NEXT: stwcx. 3, 0, 19
-; AIX32-NEXT: bne 0, L..BB0_63
+; AIX32-NEXT: bne- 0, L..BB0_63
; AIX32-NEXT: # %bb.64: # %atomicrmw.end
; AIX32-NEXT: lwz 31, L..C6(2) # @sll
; AIX32-NEXT: lwsync
@@ -1079,7 +1079,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 4, 5, 4
; AIX32-NEXT: stwcx. 4, 0, 28
-; AIX32-NEXT: bne 0, L..BB0_65
+; AIX32-NEXT: bne- 0, L..BB0_65
; AIX32-NEXT: # %bb.66: # %atomicrmw.end
; AIX32-NEXT: li 3, 255
; AIX32-NEXT: lwsync
@@ -1093,7 +1093,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 4, 5, 4
; AIX32-NEXT: stwcx. 4, 0, 27
-; AIX32-NEXT: bne 0, L..BB0_67
+; AIX32-NEXT: bne- 0, L..BB0_67
; AIX32-NEXT: # %bb.68: # %atomicrmw.end
; AIX32-NEXT: li 3, 0
; AIX32-NEXT: lwsync
@@ -1108,7 +1108,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 4, 5, 4
; AIX32-NEXT: stwcx. 4, 0, 25
-; AIX32-NEXT: bne 0, L..BB0_69
+; AIX32-NEXT: bne- 0, L..BB0_69
; AIX32-NEXT: # %bb.70: # %atomicrmw.end
; AIX32-NEXT: li 3, 0
; AIX32-NEXT: lwsync
@@ -1123,7 +1123,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 4, 5, 4
; AIX32-NEXT: stwcx. 4, 0, 23
-; AIX32-NEXT: bne 0, L..BB0_71
+; AIX32-NEXT: bne- 0, L..BB0_71
; AIX32-NEXT: # %bb.72: # %atomicrmw.end
; AIX32-NEXT: lwsync
; AIX32-NEXT: sync
@@ -1132,7 +1132,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 3, 0, 20
; AIX32-NEXT: and 3, 29, 3
; AIX32-NEXT: stwcx. 3, 0, 20
-; AIX32-NEXT: bne 0, L..BB0_73
+; AIX32-NEXT: bne- 0, L..BB0_73
; AIX32-NEXT: # %bb.74: # %atomicrmw.end
; AIX32-NEXT: lwsync
; AIX32-NEXT: sync
@@ -1141,7 +1141,7 @@ define dso_local void @test_op_ignore() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 3, 0, 19
; AIX32-NEXT: and 3, 29, 3
; AIX32-NEXT: stwcx. 3, 0, 19
-; AIX32-NEXT: bne 0, L..BB0_75
+; AIX32-NEXT: bne- 0, L..BB0_75
; AIX32-NEXT: # %bb.76: # %atomicrmw.end
; AIX32-NEXT: lwsync
; AIX32-NEXT: li 4, 0
@@ -1252,7 +1252,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 5, 0, 6
; CHECK-NEXT: addi 7, 5, 11
; CHECK-NEXT: stbcx. 7, 0, 6
-; CHECK-NEXT: bne 0, .LBB1_1
+; CHECK-NEXT: bne- 0, .LBB1_1
; CHECK-NEXT: # %bb.2: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stb 5, sc@toc@l(4)
@@ -1264,7 +1264,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 7, 0, 8
; CHECK-NEXT: addi 9, 7, 11
; CHECK-NEXT: stbcx. 9, 0, 8
-; CHECK-NEXT: bne 0, .LBB1_3
+; CHECK-NEXT: bne- 0, .LBB1_3
; CHECK-NEXT: # %bb.4: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stb 7, uc@toc@l(5)
@@ -1276,7 +1276,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 9, 0, 10
; CHECK-NEXT: addi 11, 9, 11
; CHECK-NEXT: sthcx. 11, 0, 10
-; CHECK-NEXT: bne 0, .LBB1_5
+; CHECK-NEXT: bne- 0, .LBB1_5
; CHECK-NEXT: # %bb.6: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sth 9, ss@toc@l(7)
@@ -1288,7 +1288,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 11, 0, 0
; CHECK-NEXT: addi 12, 11, 11
; CHECK-NEXT: sthcx. 12, 0, 0
-; CHECK-NEXT: bne 0, .LBB1_7
+; CHECK-NEXT: bne- 0, .LBB1_7
; CHECK-NEXT: # %bb.8: # %entry
; CHECK-NEXT: addis 12, 2, si@toc@ha
; CHECK-NEXT: lwsync
@@ -1300,7 +1300,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 11, 0, 29
; CHECK-NEXT: addi 30, 11, 11
; CHECK-NEXT: stwcx. 30, 0, 29
-; CHECK-NEXT: bne 0, .LBB1_9
+; CHECK-NEXT: bne- 0, .LBB1_9
; CHECK-NEXT: # %bb.10: # %entry
; CHECK-NEXT: addis 30, 2, ui@toc@ha
; CHECK-NEXT: lwsync
@@ -1312,7 +1312,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 11, 0, 27
; CHECK-NEXT: addi 28, 11, 11
; CHECK-NEXT: stwcx. 28, 0, 27
-; CHECK-NEXT: bne 0, .LBB1_11
+; CHECK-NEXT: bne- 0, .LBB1_11
; CHECK-NEXT: # %bb.12: # %entry
; CHECK-NEXT: addis 28, 2, sll@toc@ha
; CHECK-NEXT: lwsync
@@ -1325,7 +1325,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 26, 0, 25
; CHECK-NEXT: addi 24, 26, 11
; CHECK-NEXT: stdcx. 24, 0, 25
-; CHECK-NEXT: bne 0, .LBB1_13
+; CHECK-NEXT: bne- 0, .LBB1_13
; CHECK-NEXT: # %bb.14: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: std 26, sll@toc@l(28)
@@ -1337,7 +1337,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 23, 0, 24
; CHECK-NEXT: addi 22, 23, 11
; CHECK-NEXT: stdcx. 22, 0, 24
-; CHECK-NEXT: bne 0, .LBB1_15
+; CHECK-NEXT: bne- 0, .LBB1_15
; CHECK-NEXT: # %bb.16: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: std 23, ull@toc@l(26)
@@ -1347,7 +1347,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 23, 0, 6
; CHECK-NEXT: sub 22, 23, 3
; CHECK-NEXT: stbcx. 22, 0, 6
-; CHECK-NEXT: bne 0, .LBB1_17
+; CHECK-NEXT: bne- 0, .LBB1_17
; CHECK-NEXT: # %bb.18: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stb 23, sc@toc@l(4)
@@ -1357,7 +1357,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 23, 0, 8
; CHECK-NEXT: sub 22, 23, 3
; CHECK-NEXT: stbcx. 22, 0, 8
-; CHECK-NEXT: bne 0, .LBB1_19
+; CHECK-NEXT: bne- 0, .LBB1_19
; CHECK-NEXT: # %bb.20: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stb 23, uc@toc@l(5)
@@ -1367,7 +1367,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 23, 0, 10
; CHECK-NEXT: sub 22, 23, 3
; CHECK-NEXT: sthcx. 22, 0, 10
-; CHECK-NEXT: bne 0, .LBB1_21
+; CHECK-NEXT: bne- 0, .LBB1_21
; CHECK-NEXT: # %bb.22: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sth 23, ss@toc@l(7)
@@ -1377,7 +1377,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 23, 0, 0
; CHECK-NEXT: sub 22, 23, 3
; CHECK-NEXT: sthcx. 22, 0, 0
-; CHECK-NEXT: bne 0, .LBB1_23
+; CHECK-NEXT: bne- 0, .LBB1_23
; CHECK-NEXT: # %bb.24: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sth 23, us@toc@l(9)
@@ -1387,7 +1387,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 23, 0, 29
; CHECK-NEXT: sub 22, 23, 3
; CHECK-NEXT: stwcx. 22, 0, 29
-; CHECK-NEXT: bne 0, .LBB1_25
+; CHECK-NEXT: bne- 0, .LBB1_25
; CHECK-NEXT: # %bb.26: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stw 23, si@toc@l(12)
@@ -1397,7 +1397,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 23, 0, 27
; CHECK-NEXT: sub 22, 23, 3
; CHECK-NEXT: stwcx. 22, 0, 27
-; CHECK-NEXT: bne 0, .LBB1_27
+; CHECK-NEXT: bne- 0, .LBB1_27
; CHECK-NEXT: # %bb.28: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stw 23, ui@toc@l(30)
@@ -1407,7 +1407,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 23, 0, 25
; CHECK-NEXT: sub 22, 23, 11
; CHECK-NEXT: stdcx. 22, 0, 25
-; CHECK-NEXT: bne 0, .LBB1_29
+; CHECK-NEXT: bne- 0, .LBB1_29
; CHECK-NEXT: # %bb.30: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: std 23, sll@toc@l(28)
@@ -1417,7 +1417,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 23, 0, 24
; CHECK-NEXT: sub 22, 23, 11
; CHECK-NEXT: stdcx. 22, 0, 24
-; CHECK-NEXT: bne 0, .LBB1_31
+; CHECK-NEXT: bne- 0, .LBB1_31
; CHECK-NEXT: # %bb.32: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: std 23, ull@toc@l(26)
@@ -1427,7 +1427,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 23, 0, 6
; CHECK-NEXT: ori 22, 23, 11
; CHECK-NEXT: stbcx. 22, 0, 6
-; CHECK-NEXT: bne 0, .LBB1_33
+; CHECK-NEXT: bne- 0, .LBB1_33
; CHECK-NEXT: # %bb.34: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stb 23, sc@toc@l(4)
@@ -1437,7 +1437,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 23, 0, 8
; CHECK-NEXT: ori 22, 23, 11
; CHECK-NEXT: stbcx. 22, 0, 8
-; CHECK-NEXT: bne 0, .LBB1_35
+; CHECK-NEXT: bne- 0, .LBB1_35
; CHECK-NEXT: # %bb.36: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stb 23, uc@toc@l(5)
@@ -1447,7 +1447,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 23, 0, 10
; CHECK-NEXT: ori 22, 23, 11
; CHECK-NEXT: sthcx. 22, 0, 10
-; CHECK-NEXT: bne 0, .LBB1_37
+; CHECK-NEXT: bne- 0, .LBB1_37
; CHECK-NEXT: # %bb.38: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sth 23, ss@toc@l(7)
@@ -1457,7 +1457,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 23, 0, 0
; CHECK-NEXT: ori 22, 23, 11
; CHECK-NEXT: sthcx. 22, 0, 0
-; CHECK-NEXT: bne 0, .LBB1_39
+; CHECK-NEXT: bne- 0, .LBB1_39
; CHECK-NEXT: # %bb.40: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sth 23, us@toc@l(9)
@@ -1467,7 +1467,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 23, 0, 29
; CHECK-NEXT: ori 22, 23, 11
; CHECK-NEXT: stwcx. 22, 0, 29
-; CHECK-NEXT: bne 0, .LBB1_41
+; CHECK-NEXT: bne- 0, .LBB1_41
; CHECK-NEXT: # %bb.42: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stw 23, si@toc@l(12)
@@ -1477,7 +1477,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 23, 0, 27
; CHECK-NEXT: ori 22, 23, 11
; CHECK-NEXT: stwcx. 22, 0, 27
-; CHECK-NEXT: bne 0, .LBB1_43
+; CHECK-NEXT: bne- 0, .LBB1_43
; CHECK-NEXT: # %bb.44: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stw 23, ui@toc@l(30)
@@ -1487,7 +1487,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 23, 0, 25
; CHECK-NEXT: ori 22, 23, 11
; CHECK-NEXT: stdcx. 22, 0, 25
-; CHECK-NEXT: bne 0, .LBB1_45
+; CHECK-NEXT: bne- 0, .LBB1_45
; CHECK-NEXT: # %bb.46: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: std 23, sll@toc@l(28)
@@ -1497,7 +1497,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 23, 0, 24
; CHECK-NEXT: ori 22, 23, 11
; CHECK-NEXT: stdcx. 22, 0, 24
-; CHECK-NEXT: bne 0, .LBB1_47
+; CHECK-NEXT: bne- 0, .LBB1_47
; CHECK-NEXT: # %bb.48: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: std 23, ull@toc@l(26)
@@ -1507,7 +1507,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 23, 0, 6
; CHECK-NEXT: xori 22, 23, 11
; CHECK-NEXT: stbcx. 22, 0, 6
-; CHECK-NEXT: bne 0, .LBB1_49
+; CHECK-NEXT: bne- 0, .LBB1_49
; CHECK-NEXT: # %bb.50: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stb 23, sc@toc@l(4)
@@ -1517,7 +1517,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 23, 0, 8
; CHECK-NEXT: xori 22, 23, 11
; CHECK-NEXT: stbcx. 22, 0, 8
-; CHECK-NEXT: bne 0, .LBB1_51
+; CHECK-NEXT: bne- 0, .LBB1_51
; CHECK-NEXT: # %bb.52: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stb 23, uc@toc@l(5)
@@ -1527,7 +1527,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 23, 0, 10
; CHECK-NEXT: xori 22, 23, 11
; CHECK-NEXT: sthcx. 22, 0, 10
-; CHECK-NEXT: bne 0, .LBB1_53
+; CHECK-NEXT: bne- 0, .LBB1_53
; CHECK-NEXT: # %bb.54: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sth 23, ss@toc@l(7)
@@ -1537,7 +1537,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 23, 0, 0
; CHECK-NEXT: xori 22, 23, 11
; CHECK-NEXT: sthcx. 22, 0, 0
-; CHECK-NEXT: bne 0, .LBB1_55
+; CHECK-NEXT: bne- 0, .LBB1_55
; CHECK-NEXT: # %bb.56: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sth 23, us@toc@l(9)
@@ -1547,7 +1547,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 23, 0, 29
; CHECK-NEXT: xori 22, 23, 11
; CHECK-NEXT: stwcx. 22, 0, 29
-; CHECK-NEXT: bne 0, .LBB1_57
+; CHECK-NEXT: bne- 0, .LBB1_57
; CHECK-NEXT: # %bb.58: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stw 23, si@toc@l(12)
@@ -1557,7 +1557,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 23, 0, 27
; CHECK-NEXT: xori 22, 23, 11
; CHECK-NEXT: stwcx. 22, 0, 27
-; CHECK-NEXT: bne 0, .LBB1_59
+; CHECK-NEXT: bne- 0, .LBB1_59
; CHECK-NEXT: # %bb.60: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stw 23, ui@toc@l(30)
@@ -1567,7 +1567,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 23, 0, 25
; CHECK-NEXT: xori 22, 23, 11
; CHECK-NEXT: stdcx. 22, 0, 25
-; CHECK-NEXT: bne 0, .LBB1_61
+; CHECK-NEXT: bne- 0, .LBB1_61
; CHECK-NEXT: # %bb.62: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: std 23, sll@toc@l(28)
@@ -1577,7 +1577,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 23, 0, 24
; CHECK-NEXT: xori 22, 23, 11
; CHECK-NEXT: stdcx. 22, 0, 24
-; CHECK-NEXT: bne 0, .LBB1_63
+; CHECK-NEXT: bne- 0, .LBB1_63
; CHECK-NEXT: # %bb.64: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: std 23, ull@toc@l(26)
@@ -1587,7 +1587,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 23, 0, 6
; CHECK-NEXT: nand 22, 3, 23
; CHECK-NEXT: stbcx. 22, 0, 6
-; CHECK-NEXT: bne 0, .LBB1_65
+; CHECK-NEXT: bne- 0, .LBB1_65
; CHECK-NEXT: # %bb.66: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stb 23, sc@toc@l(4)
@@ -1597,7 +1597,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 23, 0, 8
; CHECK-NEXT: nand 22, 3, 23
; CHECK-NEXT: stbcx. 22, 0, 8
-; CHECK-NEXT: bne 0, .LBB1_67
+; CHECK-NEXT: bne- 0, .LBB1_67
; CHECK-NEXT: # %bb.68: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stb 23, uc@toc@l(5)
@@ -1607,7 +1607,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 23, 0, 10
; CHECK-NEXT: nand 22, 3, 23
; CHECK-NEXT: sthcx. 22, 0, 10
-; CHECK-NEXT: bne 0, .LBB1_69
+; CHECK-NEXT: bne- 0, .LBB1_69
; CHECK-NEXT: # %bb.70: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sth 23, ss@toc@l(7)
@@ -1617,7 +1617,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 23, 0, 0
; CHECK-NEXT: nand 22, 3, 23
; CHECK-NEXT: sthcx. 22, 0, 0
-; CHECK-NEXT: bne 0, .LBB1_71
+; CHECK-NEXT: bne- 0, .LBB1_71
; CHECK-NEXT: # %bb.72: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sth 23, us@toc@l(9)
@@ -1627,7 +1627,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 23, 0, 29
; CHECK-NEXT: nand 22, 3, 23
; CHECK-NEXT: stwcx. 22, 0, 29
-; CHECK-NEXT: bne 0, .LBB1_73
+; CHECK-NEXT: bne- 0, .LBB1_73
; CHECK-NEXT: # %bb.74: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stw 23, si@toc@l(12)
@@ -1637,7 +1637,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 23, 0, 27
; CHECK-NEXT: nand 22, 3, 23
; CHECK-NEXT: stwcx. 22, 0, 27
-; CHECK-NEXT: bne 0, .LBB1_75
+; CHECK-NEXT: bne- 0, .LBB1_75
; CHECK-NEXT: # %bb.76: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stw 23, ui@toc@l(30)
@@ -1647,7 +1647,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 23, 0, 25
; CHECK-NEXT: nand 22, 11, 23
; CHECK-NEXT: stdcx. 22, 0, 25
-; CHECK-NEXT: bne 0, .LBB1_77
+; CHECK-NEXT: bne- 0, .LBB1_77
; CHECK-NEXT: # %bb.78: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: std 23, sll@toc@l(28)
@@ -1657,7 +1657,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 23, 0, 24
; CHECK-NEXT: nand 22, 11, 23
; CHECK-NEXT: stdcx. 22, 0, 24
-; CHECK-NEXT: bne 0, .LBB1_79
+; CHECK-NEXT: bne- 0, .LBB1_79
; CHECK-NEXT: # %bb.80: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: std 23, ull@toc@l(26)
@@ -1667,7 +1667,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 23, 0, 6
; CHECK-NEXT: and 22, 3, 23
; CHECK-NEXT: stbcx. 22, 0, 6
-; CHECK-NEXT: bne 0, .LBB1_81
+; CHECK-NEXT: bne- 0, .LBB1_81
; CHECK-NEXT: # %bb.82: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stb 23, sc@toc@l(4)
@@ -1677,7 +1677,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 4, 0, 8
; CHECK-NEXT: and 6, 3, 4
; CHECK-NEXT: stbcx. 6, 0, 8
-; CHECK-NEXT: bne 0, .LBB1_83
+; CHECK-NEXT: bne- 0, .LBB1_83
; CHECK-NEXT: # %bb.84: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stb 4, uc@toc@l(5)
@@ -1687,7 +1687,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 4, 0, 10
; CHECK-NEXT: and 5, 3, 4
; CHECK-NEXT: sthcx. 5, 0, 10
-; CHECK-NEXT: bne 0, .LBB1_85
+; CHECK-NEXT: bne- 0, .LBB1_85
; CHECK-NEXT: # %bb.86: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sth 4, ss@toc@l(7)
@@ -1697,7 +1697,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 4, 0, 0
; CHECK-NEXT: and 5, 3, 4
; CHECK-NEXT: sthcx. 5, 0, 0
-; CHECK-NEXT: bne 0, .LBB1_87
+; CHECK-NEXT: bne- 0, .LBB1_87
; CHECK-NEXT: # %bb.88: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sth 4, us@toc@l(9)
@@ -1707,7 +1707,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 4, 0, 29
; CHECK-NEXT: and 5, 3, 4
; CHECK-NEXT: stwcx. 5, 0, 29
-; CHECK-NEXT: bne 0, .LBB1_89
+; CHECK-NEXT: bne- 0, .LBB1_89
; CHECK-NEXT: # %bb.90: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stw 4, si@toc@l(12)
@@ -1717,7 +1717,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 4, 0, 27
; CHECK-NEXT: and 5, 3, 4
; CHECK-NEXT: stwcx. 5, 0, 27
-; CHECK-NEXT: bne 0, .LBB1_91
+; CHECK-NEXT: bne- 0, .LBB1_91
; CHECK-NEXT: # %bb.92: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stw 4, ui@toc@l(30)
@@ -1727,7 +1727,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 3, 0, 25
; CHECK-NEXT: and 4, 11, 3
; CHECK-NEXT: stdcx. 4, 0, 25
-; CHECK-NEXT: bne 0, .LBB1_93
+; CHECK-NEXT: bne- 0, .LBB1_93
; CHECK-NEXT: # %bb.94: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: std 3, sll@toc@l(28)
@@ -1737,7 +1737,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 3, 0, 24
; CHECK-NEXT: and 4, 11, 3
; CHECK-NEXT: stdcx. 4, 0, 24
-; CHECK-NEXT: bne 0, .LBB1_95
+; CHECK-NEXT: bne- 0, .LBB1_95
; CHECK-NEXT: # %bb.96: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: std 3, ull@toc@l(26)
@@ -1794,7 +1794,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 5, 5, 6
; AIX32-NEXT: stwcx. 5, 0, 25
-; AIX32-NEXT: bne 0, L..BB1_1
+; AIX32-NEXT: bne- 0, L..BB1_1
; AIX32-NEXT: # %bb.2: # %entry
; AIX32-NEXT: srw 3, 4, 26
; AIX32-NEXT: lwsync
@@ -1817,7 +1817,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 5, 5, 6
; AIX32-NEXT: stwcx. 5, 0, 21
-; AIX32-NEXT: bne 0, L..BB1_3
+; AIX32-NEXT: bne- 0, L..BB1_3
; AIX32-NEXT: # %bb.4: # %entry
; AIX32-NEXT: srw 3, 4, 22
; AIX32-NEXT: lwz 23, L..C2(2) # @ss
@@ -1840,7 +1840,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 5, 5, 6
; AIX32-NEXT: stwcx. 5, 0, 17
-; AIX32-NEXT: bne 0, L..BB1_5
+; AIX32-NEXT: bne- 0, L..BB1_5
; AIX32-NEXT: # %bb.6: # %entry
; AIX32-NEXT: srw 3, 4, 18
; AIX32-NEXT: lwz 19, L..C3(2) # @us
@@ -1863,7 +1863,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 5, 5, 6
; AIX32-NEXT: stwcx. 5, 0, 14
-; AIX32-NEXT: bne 0, L..BB1_7
+; AIX32-NEXT: bne- 0, L..BB1_7
; AIX32-NEXT: # %bb.8: # %entry
; AIX32-NEXT: srw 3, 4, 15
; AIX32-NEXT: lwsync
@@ -1876,7 +1876,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 3, 0, 29
; AIX32-NEXT: addi 4, 3, 11
; AIX32-NEXT: stwcx. 4, 0, 29
-; AIX32-NEXT: bne 0, L..BB1_9
+; AIX32-NEXT: bne- 0, L..BB1_9
; AIX32-NEXT: # %bb.10: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: stw 3, 0(29)
@@ -1887,7 +1887,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 3, 0, 28
; AIX32-NEXT: addi 4, 3, 11
; AIX32-NEXT: stwcx. 4, 0, 28
-; AIX32-NEXT: bne 0, L..BB1_11
+; AIX32-NEXT: bne- 0, L..BB1_11
; AIX32-NEXT: # %bb.12: # %entry
; AIX32-NEXT: lwz 31, L..C6(2) # @sll
; AIX32-NEXT: lwsync
@@ -1920,7 +1920,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 5, 5, 6
; AIX32-NEXT: stwcx. 5, 0, 25
-; AIX32-NEXT: bne 0, L..BB1_13
+; AIX32-NEXT: bne- 0, L..BB1_13
; AIX32-NEXT: # %bb.14: # %entry
; AIX32-NEXT: srw 3, 4, 26
; AIX32-NEXT: lwsync
@@ -1938,7 +1938,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 5, 5, 6
; AIX32-NEXT: stwcx. 5, 0, 21
-; AIX32-NEXT: bne 0, L..BB1_15
+; AIX32-NEXT: bne- 0, L..BB1_15
; AIX32-NEXT: # %bb.16: # %entry
; AIX32-NEXT: srw 3, 4, 22
; AIX32-NEXT: lwsync
@@ -1957,7 +1957,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 5, 5, 6
; AIX32-NEXT: stwcx. 5, 0, 17
-; AIX32-NEXT: bne 0, L..BB1_17
+; AIX32-NEXT: bne- 0, L..BB1_17
; AIX32-NEXT: # %bb.18: # %entry
; AIX32-NEXT: srw 3, 4, 18
; AIX32-NEXT: lwsync
@@ -1975,7 +1975,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 5, 5, 6
; AIX32-NEXT: stwcx. 5, 0, 14
-; AIX32-NEXT: bne 0, L..BB1_19
+; AIX32-NEXT: bne- 0, L..BB1_19
; AIX32-NEXT: # %bb.20: # %entry
; AIX32-NEXT: srw 3, 4, 15
; AIX32-NEXT: lwsync
@@ -1987,7 +1987,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 3, 0, 29
; AIX32-NEXT: sub 4, 3, 7
; AIX32-NEXT: stwcx. 4, 0, 29
-; AIX32-NEXT: bne 0, L..BB1_21
+; AIX32-NEXT: bne- 0, L..BB1_21
; AIX32-NEXT: # %bb.22: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: stw 3, 0(29)
@@ -1997,7 +1997,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 3, 0, 28
; AIX32-NEXT: sub 4, 3, 7
; AIX32-NEXT: stwcx. 4, 0, 28
-; AIX32-NEXT: bne 0, L..BB1_23
+; AIX32-NEXT: bne- 0, L..BB1_23
; AIX32-NEXT: # %bb.24: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: li 4, 0
@@ -2028,7 +2028,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 5, 5, 6
; AIX32-NEXT: stwcx. 5, 0, 25
-; AIX32-NEXT: bne 0, L..BB1_25
+; AIX32-NEXT: bne- 0, L..BB1_25
; AIX32-NEXT: # %bb.26: # %entry
; AIX32-NEXT: srw 3, 4, 26
; AIX32-NEXT: lwsync
@@ -2046,7 +2046,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 5, 5, 6
; AIX32-NEXT: stwcx. 5, 0, 21
-; AIX32-NEXT: bne 0, L..BB1_27
+; AIX32-NEXT: bne- 0, L..BB1_27
; AIX32-NEXT: # %bb.28: # %entry
; AIX32-NEXT: srw 3, 4, 22
; AIX32-NEXT: lwsync
@@ -2064,7 +2064,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 5, 5, 6
; AIX32-NEXT: stwcx. 5, 0, 17
-; AIX32-NEXT: bne 0, L..BB1_29
+; AIX32-NEXT: bne- 0, L..BB1_29
; AIX32-NEXT: # %bb.30: # %entry
; AIX32-NEXT: srw 3, 4, 18
; AIX32-NEXT: lwsync
@@ -2082,7 +2082,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 5, 5, 6
; AIX32-NEXT: stwcx. 5, 0, 14
-; AIX32-NEXT: bne 0, L..BB1_31
+; AIX32-NEXT: bne- 0, L..BB1_31
; AIX32-NEXT: # %bb.32: # %entry
; AIX32-NEXT: srw 3, 4, 15
; AIX32-NEXT: lwsync
@@ -2094,7 +2094,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 3, 0, 29
; AIX32-NEXT: ori 4, 3, 11
; AIX32-NEXT: stwcx. 4, 0, 29
-; AIX32-NEXT: bne 0, L..BB1_33
+; AIX32-NEXT: bne- 0, L..BB1_33
; AIX32-NEXT: # %bb.34: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: stw 3, 0(29)
@@ -2104,7 +2104,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 3, 0, 28
; AIX32-NEXT: ori 4, 3, 11
; AIX32-NEXT: stwcx. 4, 0, 28
-; AIX32-NEXT: bne 0, L..BB1_35
+; AIX32-NEXT: bne- 0, L..BB1_35
; AIX32-NEXT: # %bb.36: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: li 4, 0
@@ -2135,7 +2135,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 5, 5, 6
; AIX32-NEXT: stwcx. 5, 0, 25
-; AIX32-NEXT: bne 0, L..BB1_37
+; AIX32-NEXT: bne- 0, L..BB1_37
; AIX32-NEXT: # %bb.38: # %entry
; AIX32-NEXT: srw 3, 4, 26
; AIX32-NEXT: lwsync
@@ -2153,7 +2153,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 5, 5, 6
; AIX32-NEXT: stwcx. 5, 0, 21
-; AIX32-NEXT: bne 0, L..BB1_39
+; AIX32-NEXT: bne- 0, L..BB1_39
; AIX32-NEXT: # %bb.40: # %entry
; AIX32-NEXT: srw 3, 4, 22
; AIX32-NEXT: lwsync
@@ -2171,7 +2171,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 5, 5, 6
; AIX32-NEXT: stwcx. 5, 0, 17
-; AIX32-NEXT: bne 0, L..BB1_41
+; AIX32-NEXT: bne- 0, L..BB1_41
; AIX32-NEXT: # %bb.42: # %entry
; AIX32-NEXT: srw 3, 4, 18
; AIX32-NEXT: lwsync
@@ -2189,7 +2189,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 5, 5, 6
; AIX32-NEXT: stwcx. 5, 0, 14
-; AIX32-NEXT: bne 0, L..BB1_43
+; AIX32-NEXT: bne- 0, L..BB1_43
; AIX32-NEXT: # %bb.44: # %entry
; AIX32-NEXT: srw 3, 4, 15
; AIX32-NEXT: lwsync
@@ -2201,7 +2201,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 3, 0, 29
; AIX32-NEXT: xori 4, 3, 11
; AIX32-NEXT: stwcx. 4, 0, 29
-; AIX32-NEXT: bne 0, L..BB1_45
+; AIX32-NEXT: bne- 0, L..BB1_45
; AIX32-NEXT: # %bb.46: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: stw 3, 0(29)
@@ -2211,7 +2211,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 3, 0, 28
; AIX32-NEXT: xori 4, 3, 11
; AIX32-NEXT: stwcx. 4, 0, 28
-; AIX32-NEXT: bne 0, L..BB1_47
+; AIX32-NEXT: bne- 0, L..BB1_47
; AIX32-NEXT: # %bb.48: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: li 4, 0
@@ -2242,7 +2242,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 5, 5, 6
; AIX32-NEXT: stwcx. 5, 0, 25
-; AIX32-NEXT: bne 0, L..BB1_49
+; AIX32-NEXT: bne- 0, L..BB1_49
; AIX32-NEXT: # %bb.50: # %entry
; AIX32-NEXT: srw 3, 4, 26
; AIX32-NEXT: lwsync
@@ -2261,7 +2261,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 5, 5, 6
; AIX32-NEXT: stwcx. 5, 0, 21
-; AIX32-NEXT: bne 0, L..BB1_51
+; AIX32-NEXT: bne- 0, L..BB1_51
; AIX32-NEXT: # %bb.52: # %entry
; AIX32-NEXT: srw 3, 4, 22
; AIX32-NEXT: lwsync
@@ -2279,7 +2279,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 5, 5, 6
; AIX32-NEXT: stwcx. 5, 0, 17
-; AIX32-NEXT: bne 0, L..BB1_53
+; AIX32-NEXT: bne- 0, L..BB1_53
; AIX32-NEXT: # %bb.54: # %entry
; AIX32-NEXT: srw 3, 4, 18
; AIX32-NEXT: lwsync
@@ -2297,7 +2297,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 5, 5, 6
; AIX32-NEXT: stwcx. 5, 0, 14
-; AIX32-NEXT: bne 0, L..BB1_55
+; AIX32-NEXT: bne- 0, L..BB1_55
; AIX32-NEXT: # %bb.56: # %entry
; AIX32-NEXT: srw 3, 4, 15
; AIX32-NEXT: lwsync
@@ -2309,7 +2309,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 3, 0, 29
; AIX32-NEXT: nand 4, 7, 3
; AIX32-NEXT: stwcx. 4, 0, 29
-; AIX32-NEXT: bne 0, L..BB1_57
+; AIX32-NEXT: bne- 0, L..BB1_57
; AIX32-NEXT: # %bb.58: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: stw 3, 0(29)
@@ -2319,7 +2319,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 3, 0, 28
; AIX32-NEXT: nand 4, 7, 3
; AIX32-NEXT: stwcx. 4, 0, 28
-; AIX32-NEXT: bne 0, L..BB1_59
+; AIX32-NEXT: bne- 0, L..BB1_59
; AIX32-NEXT: # %bb.60: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: li 4, 0
@@ -2350,7 +2350,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 5, 5, 6
; AIX32-NEXT: stwcx. 5, 0, 25
-; AIX32-NEXT: bne 0, L..BB1_61
+; AIX32-NEXT: bne- 0, L..BB1_61
; AIX32-NEXT: # %bb.62: # %entry
; AIX32-NEXT: srw 3, 4, 26
; AIX32-NEXT: lwsync
@@ -2368,7 +2368,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 5, 5, 6
; AIX32-NEXT: stwcx. 5, 0, 21
-; AIX32-NEXT: bne 0, L..BB1_63
+; AIX32-NEXT: bne- 0, L..BB1_63
; AIX32-NEXT: # %bb.64: # %entry
; AIX32-NEXT: srw 3, 4, 22
; AIX32-NEXT: lwsync
@@ -2387,7 +2387,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 5, 5, 6
; AIX32-NEXT: stwcx. 5, 0, 17
-; AIX32-NEXT: bne 0, L..BB1_65
+; AIX32-NEXT: bne- 0, L..BB1_65
; AIX32-NEXT: # %bb.66: # %entry
; AIX32-NEXT: srw 3, 4, 18
; AIX32-NEXT: lwsync
@@ -2405,7 +2405,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: and 5, 5, 3
; AIX32-NEXT: or 5, 5, 6
; AIX32-NEXT: stwcx. 5, 0, 14
-; AIX32-NEXT: bne 0, L..BB1_67
+; AIX32-NEXT: bne- 0, L..BB1_67
; AIX32-NEXT: # %bb.68: # %entry
; AIX32-NEXT: srw 3, 4, 15
; AIX32-NEXT: lwsync
@@ -2417,7 +2417,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 3, 0, 29
; AIX32-NEXT: and 4, 7, 3
; AIX32-NEXT: stwcx. 4, 0, 29
-; AIX32-NEXT: bne 0, L..BB1_69
+; AIX32-NEXT: bne- 0, L..BB1_69
; AIX32-NEXT: # %bb.70: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: stw 3, 0(29)
@@ -2427,7 +2427,7 @@ define dso_local void @test_fetch_and_op() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 3, 0, 28
; AIX32-NEXT: and 4, 7, 3
; AIX32-NEXT: stwcx. 4, 0, 28
-; AIX32-NEXT: bne 0, L..BB1_71
+; AIX32-NEXT: bne- 0, L..BB1_71
; AIX32-NEXT: # %bb.72: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: li 4, 0
@@ -2599,7 +2599,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 8, 0, 7
; CHECK-NEXT: add 8, 6, 8
; CHECK-NEXT: stbcx. 8, 0, 7
-; CHECK-NEXT: bne 0, .LBB2_1
+; CHECK-NEXT: bne- 0, .LBB2_1
; CHECK-NEXT: # %bb.2: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stb 8, sc@toc@l(5)
@@ -2610,7 +2610,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 8, 0, 4
; CHECK-NEXT: add 8, 6, 8
; CHECK-NEXT: stbcx. 8, 0, 4
-; CHECK-NEXT: bne 0, .LBB2_3
+; CHECK-NEXT: bne- 0, .LBB2_3
; CHECK-NEXT: # %bb.4: # %entry
; CHECK-NEXT: addis 6, 2, ss@toc@ha
; CHECK-NEXT: lwsync
@@ -2623,7 +2623,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 10, 0, 9
; CHECK-NEXT: add 10, 8, 10
; CHECK-NEXT: sthcx. 10, 0, 9
-; CHECK-NEXT: bne 0, .LBB2_5
+; CHECK-NEXT: bne- 0, .LBB2_5
; CHECK-NEXT: # %bb.6: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: addis 8, 2, us@toc@ha
@@ -2636,7 +2636,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 12, 0, 11
; CHECK-NEXT: add 12, 10, 12
; CHECK-NEXT: sthcx. 12, 0, 11
-; CHECK-NEXT: bne 0, .LBB2_7
+; CHECK-NEXT: bne- 0, .LBB2_7
; CHECK-NEXT: # %bb.8: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: addis 10, 2, si@toc@ha
@@ -2649,7 +2649,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 30, 0, 0
; CHECK-NEXT: add 30, 12, 30
; CHECK-NEXT: stwcx. 30, 0, 0
-; CHECK-NEXT: bne 0, .LBB2_9
+; CHECK-NEXT: bne- 0, .LBB2_9
; CHECK-NEXT: # %bb.10: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: addis 12, 2, ui@toc@ha
@@ -2662,7 +2662,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 28, 0, 29
; CHECK-NEXT: add 28, 30, 28
; CHECK-NEXT: stwcx. 28, 0, 29
-; CHECK-NEXT: bne 0, .LBB2_11
+; CHECK-NEXT: bne- 0, .LBB2_11
; CHECK-NEXT: # %bb.12: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: addis 30, 2, sll@toc@ha
@@ -2675,7 +2675,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 26, 0, 27
; CHECK-NEXT: add 26, 28, 26
; CHECK-NEXT: stdcx. 26, 0, 27
-; CHECK-NEXT: bne 0, .LBB2_13
+; CHECK-NEXT: bne- 0, .LBB2_13
; CHECK-NEXT: # %bb.14: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: addis 28, 2, ull@toc@ha
@@ -2688,7 +2688,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 24, 0, 26
; CHECK-NEXT: add 24, 25, 24
; CHECK-NEXT: stdcx. 24, 0, 26
-; CHECK-NEXT: bne 0, .LBB2_15
+; CHECK-NEXT: bne- 0, .LBB2_15
; CHECK-NEXT: # %bb.16: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: std 24, ull@toc@l(28)
@@ -2699,7 +2699,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 24, 0, 7
; CHECK-NEXT: sub 24, 24, 25
; CHECK-NEXT: stbcx. 24, 0, 7
-; CHECK-NEXT: bne 0, .LBB2_17
+; CHECK-NEXT: bne- 0, .LBB2_17
; CHECK-NEXT: # %bb.18: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stb 24, sc@toc@l(5)
@@ -2710,7 +2710,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 24, 0, 4
; CHECK-NEXT: sub 24, 24, 25
; CHECK-NEXT: stbcx. 24, 0, 4
-; CHECK-NEXT: bne 0, .LBB2_19
+; CHECK-NEXT: bne- 0, .LBB2_19
; CHECK-NEXT: # %bb.20: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stb 24, uc@toc@l(3)
@@ -2721,7 +2721,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 24, 0, 9
; CHECK-NEXT: sub 24, 24, 25
; CHECK-NEXT: sthcx. 24, 0, 9
-; CHECK-NEXT: bne 0, .LBB2_21
+; CHECK-NEXT: bne- 0, .LBB2_21
; CHECK-NEXT: # %bb.22: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sth 24, ss@toc@l(6)
@@ -2732,7 +2732,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 24, 0, 11
; CHECK-NEXT: sub 24, 24, 25
; CHECK-NEXT: sthcx. 24, 0, 11
-; CHECK-NEXT: bne 0, .LBB2_23
+; CHECK-NEXT: bne- 0, .LBB2_23
; CHECK-NEXT: # %bb.24: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sth 24, us@toc@l(8)
@@ -2743,7 +2743,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 24, 0, 0
; CHECK-NEXT: sub 24, 24, 25
; CHECK-NEXT: stwcx. 24, 0, 0
-; CHECK-NEXT: bne 0, .LBB2_25
+; CHECK-NEXT: bne- 0, .LBB2_25
; CHECK-NEXT: # %bb.26: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stw 24, si@toc@l(10)
@@ -2754,7 +2754,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 24, 0, 29
; CHECK-NEXT: sub 24, 24, 25
; CHECK-NEXT: stwcx. 24, 0, 29
-; CHECK-NEXT: bne 0, .LBB2_27
+; CHECK-NEXT: bne- 0, .LBB2_27
; CHECK-NEXT: # %bb.28: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stw 24, ui@toc@l(12)
@@ -2765,7 +2765,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 24, 0, 27
; CHECK-NEXT: sub 24, 24, 25
; CHECK-NEXT: stdcx. 24, 0, 27
-; CHECK-NEXT: bne 0, .LBB2_29
+; CHECK-NEXT: bne- 0, .LBB2_29
; CHECK-NEXT: # %bb.30: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: std 24, sll@toc@l(30)
@@ -2776,7 +2776,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 24, 0, 26
; CHECK-NEXT: sub 24, 24, 25
; CHECK-NEXT: stdcx. 24, 0, 26
-; CHECK-NEXT: bne 0, .LBB2_31
+; CHECK-NEXT: bne- 0, .LBB2_31
; CHECK-NEXT: # %bb.32: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: std 24, ull@toc@l(28)
@@ -2787,7 +2787,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 24, 0, 7
; CHECK-NEXT: or 24, 25, 24
; CHECK-NEXT: stbcx. 24, 0, 7
-; CHECK-NEXT: bne 0, .LBB2_33
+; CHECK-NEXT: bne- 0, .LBB2_33
; CHECK-NEXT: # %bb.34: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stb 24, sc@toc@l(5)
@@ -2798,7 +2798,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 24, 0, 4
; CHECK-NEXT: or 24, 25, 24
; CHECK-NEXT: stbcx. 24, 0, 4
-; CHECK-NEXT: bne 0, .LBB2_35
+; CHECK-NEXT: bne- 0, .LBB2_35
; CHECK-NEXT: # %bb.36: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stb 24, uc@toc@l(3)
@@ -2809,7 +2809,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 24, 0, 9
; CHECK-NEXT: or 24, 25, 24
; CHECK-NEXT: sthcx. 24, 0, 9
-; CHECK-NEXT: bne 0, .LBB2_37
+; CHECK-NEXT: bne- 0, .LBB2_37
; CHECK-NEXT: # %bb.38: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sth 24, ss@toc@l(6)
@@ -2820,7 +2820,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 24, 0, 11
; CHECK-NEXT: or 24, 25, 24
; CHECK-NEXT: sthcx. 24, 0, 11
-; CHECK-NEXT: bne 0, .LBB2_39
+; CHECK-NEXT: bne- 0, .LBB2_39
; CHECK-NEXT: # %bb.40: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sth 24, us@toc@l(8)
@@ -2831,7 +2831,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 24, 0, 0
; CHECK-NEXT: or 24, 25, 24
; CHECK-NEXT: stwcx. 24, 0, 0
-; CHECK-NEXT: bne 0, .LBB2_41
+; CHECK-NEXT: bne- 0, .LBB2_41
; CHECK-NEXT: # %bb.42: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stw 24, si@toc@l(10)
@@ -2842,7 +2842,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 24, 0, 29
; CHECK-NEXT: or 24, 25, 24
; CHECK-NEXT: stwcx. 24, 0, 29
-; CHECK-NEXT: bne 0, .LBB2_43
+; CHECK-NEXT: bne- 0, .LBB2_43
; CHECK-NEXT: # %bb.44: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stw 24, ui@toc@l(12)
@@ -2853,7 +2853,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 24, 0, 27
; CHECK-NEXT: or 24, 25, 24
; CHECK-NEXT: stdcx. 24, 0, 27
-; CHECK-NEXT: bne 0, .LBB2_45
+; CHECK-NEXT: bne- 0, .LBB2_45
; CHECK-NEXT: # %bb.46: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: std 24, sll@toc@l(30)
@@ -2864,7 +2864,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 24, 0, 26
; CHECK-NEXT: or 24, 25, 24
; CHECK-NEXT: stdcx. 24, 0, 26
-; CHECK-NEXT: bne 0, .LBB2_47
+; CHECK-NEXT: bne- 0, .LBB2_47
; CHECK-NEXT: # %bb.48: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: std 24, ull@toc@l(28)
@@ -2875,7 +2875,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 24, 0, 7
; CHECK-NEXT: xor 24, 25, 24
; CHECK-NEXT: stbcx. 24, 0, 7
-; CHECK-NEXT: bne 0, .LBB2_49
+; CHECK-NEXT: bne- 0, .LBB2_49
; CHECK-NEXT: # %bb.50: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stb 24, sc@toc@l(5)
@@ -2886,7 +2886,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 24, 0, 4
; CHECK-NEXT: xor 24, 25, 24
; CHECK-NEXT: stbcx. 24, 0, 4
-; CHECK-NEXT: bne 0, .LBB2_51
+; CHECK-NEXT: bne- 0, .LBB2_51
; CHECK-NEXT: # %bb.52: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stb 24, uc@toc@l(3)
@@ -2897,7 +2897,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 24, 0, 9
; CHECK-NEXT: xor 24, 25, 24
; CHECK-NEXT: sthcx. 24, 0, 9
-; CHECK-NEXT: bne 0, .LBB2_53
+; CHECK-NEXT: bne- 0, .LBB2_53
; CHECK-NEXT: # %bb.54: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sth 24, ss@toc@l(6)
@@ -2908,7 +2908,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 24, 0, 11
; CHECK-NEXT: xor 24, 25, 24
; CHECK-NEXT: sthcx. 24, 0, 11
-; CHECK-NEXT: bne 0, .LBB2_55
+; CHECK-NEXT: bne- 0, .LBB2_55
; CHECK-NEXT: # %bb.56: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sth 24, us@toc@l(8)
@@ -2919,7 +2919,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 24, 0, 0
; CHECK-NEXT: xor 24, 25, 24
; CHECK-NEXT: stwcx. 24, 0, 0
-; CHECK-NEXT: bne 0, .LBB2_57
+; CHECK-NEXT: bne- 0, .LBB2_57
; CHECK-NEXT: # %bb.58: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stw 24, si@toc@l(10)
@@ -2930,7 +2930,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 24, 0, 29
; CHECK-NEXT: xor 24, 25, 24
; CHECK-NEXT: stwcx. 24, 0, 29
-; CHECK-NEXT: bne 0, .LBB2_59
+; CHECK-NEXT: bne- 0, .LBB2_59
; CHECK-NEXT: # %bb.60: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stw 24, ui@toc@l(12)
@@ -2941,7 +2941,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 24, 0, 27
; CHECK-NEXT: xor 24, 25, 24
; CHECK-NEXT: stdcx. 24, 0, 27
-; CHECK-NEXT: bne 0, .LBB2_61
+; CHECK-NEXT: bne- 0, .LBB2_61
; CHECK-NEXT: # %bb.62: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: std 24, sll@toc@l(30)
@@ -2952,7 +2952,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 24, 0, 26
; CHECK-NEXT: xor 24, 25, 24
; CHECK-NEXT: stdcx. 24, 0, 26
-; CHECK-NEXT: bne 0, .LBB2_63
+; CHECK-NEXT: bne- 0, .LBB2_63
; CHECK-NEXT: # %bb.64: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: std 24, ull@toc@l(28)
@@ -2963,7 +2963,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 24, 0, 7
; CHECK-NEXT: nand 24, 25, 24
; CHECK-NEXT: stbcx. 24, 0, 7
-; CHECK-NEXT: bne 0, .LBB2_65
+; CHECK-NEXT: bne- 0, .LBB2_65
; CHECK-NEXT: # %bb.66: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stb 24, sc@toc@l(5)
@@ -2974,7 +2974,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 24, 0, 4
; CHECK-NEXT: nand 24, 25, 24
; CHECK-NEXT: stbcx. 24, 0, 4
-; CHECK-NEXT: bne 0, .LBB2_67
+; CHECK-NEXT: bne- 0, .LBB2_67
; CHECK-NEXT: # %bb.68: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stb 24, uc@toc@l(3)
@@ -2985,7 +2985,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 24, 0, 9
; CHECK-NEXT: nand 24, 25, 24
; CHECK-NEXT: sthcx. 24, 0, 9
-; CHECK-NEXT: bne 0, .LBB2_69
+; CHECK-NEXT: bne- 0, .LBB2_69
; CHECK-NEXT: # %bb.70: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sth 24, ss@toc@l(6)
@@ -2996,7 +2996,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 24, 0, 11
; CHECK-NEXT: nand 24, 25, 24
; CHECK-NEXT: sthcx. 24, 0, 11
-; CHECK-NEXT: bne 0, .LBB2_71
+; CHECK-NEXT: bne- 0, .LBB2_71
; CHECK-NEXT: # %bb.72: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sth 24, us@toc@l(8)
@@ -3007,7 +3007,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 24, 0, 0
; CHECK-NEXT: nand 24, 25, 24
; CHECK-NEXT: stwcx. 24, 0, 0
-; CHECK-NEXT: bne 0, .LBB2_73
+; CHECK-NEXT: bne- 0, .LBB2_73
; CHECK-NEXT: # %bb.74: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stw 24, si@toc@l(10)
@@ -3018,7 +3018,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 24, 0, 29
; CHECK-NEXT: nand 24, 25, 24
; CHECK-NEXT: stwcx. 24, 0, 29
-; CHECK-NEXT: bne 0, .LBB2_75
+; CHECK-NEXT: bne- 0, .LBB2_75
; CHECK-NEXT: # %bb.76: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stw 24, ui@toc@l(12)
@@ -3029,7 +3029,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 24, 0, 27
; CHECK-NEXT: nand 24, 25, 24
; CHECK-NEXT: stdcx. 24, 0, 27
-; CHECK-NEXT: bne 0, .LBB2_77
+; CHECK-NEXT: bne- 0, .LBB2_77
; CHECK-NEXT: # %bb.78: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: std 24, sll@toc@l(30)
@@ -3040,7 +3040,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 24, 0, 26
; CHECK-NEXT: nand 24, 25, 24
; CHECK-NEXT: stdcx. 24, 0, 26
-; CHECK-NEXT: bne 0, .LBB2_79
+; CHECK-NEXT: bne- 0, .LBB2_79
; CHECK-NEXT: # %bb.80: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: std 24, ull@toc@l(28)
@@ -3085,7 +3085,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 24, 0, 7
; CHECK-NEXT: and 24, 25, 24
; CHECK-NEXT: stbcx. 24, 0, 7
-; CHECK-NEXT: bne 0, .LBB2_85
+; CHECK-NEXT: bne- 0, .LBB2_85
; CHECK-NEXT: # %bb.86: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stb 24, sc@toc@l(5)
@@ -3096,7 +3096,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lbarx 5, 0, 4
; CHECK-NEXT: and 5, 7, 5
; CHECK-NEXT: stbcx. 5, 0, 4
-; CHECK-NEXT: bne 0, .LBB2_87
+; CHECK-NEXT: bne- 0, .LBB2_87
; CHECK-NEXT: # %bb.88: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stb 5, uc@toc@l(3)
@@ -3106,7 +3106,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 4, 0, 9
; CHECK-NEXT: and 4, 5, 4
; CHECK-NEXT: sthcx. 4, 0, 9
-; CHECK-NEXT: bne 0, .LBB2_89
+; CHECK-NEXT: bne- 0, .LBB2_89
; CHECK-NEXT: # %bb.90: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sth 4, ss@toc@l(6)
@@ -3117,7 +3117,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lharx 5, 0, 11
; CHECK-NEXT: and 5, 4, 5
; CHECK-NEXT: sthcx. 5, 0, 11
-; CHECK-NEXT: bne 0, .LBB2_91
+; CHECK-NEXT: bne- 0, .LBB2_91
; CHECK-NEXT: # %bb.92: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sth 5, us@toc@l(8)
@@ -3128,7 +3128,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 5, 0, 0
; CHECK-NEXT: and 5, 4, 5
; CHECK-NEXT: stwcx. 5, 0, 0
-; CHECK-NEXT: bne 0, .LBB2_93
+; CHECK-NEXT: bne- 0, .LBB2_93
; CHECK-NEXT: # %bb.94: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stw 5, si@toc@l(10)
@@ -3139,7 +3139,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: lwarx 5, 0, 29
; CHECK-NEXT: and 5, 4, 5
; CHECK-NEXT: stwcx. 5, 0, 29
-; CHECK-NEXT: bne 0, .LBB2_95
+; CHECK-NEXT: bne- 0, .LBB2_95
; CHECK-NEXT: # %bb.96: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stw 5, ui@toc@l(12)
@@ -3150,7 +3150,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 5, 0, 27
; CHECK-NEXT: and 5, 4, 5
; CHECK-NEXT: stdcx. 5, 0, 27
-; CHECK-NEXT: bne 0, .LBB2_97
+; CHECK-NEXT: bne- 0, .LBB2_97
; CHECK-NEXT: # %bb.98: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: std 5, sll@toc@l(30)
@@ -3161,7 +3161,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; CHECK-NEXT: ldarx 4, 0, 26
; CHECK-NEXT: and 4, 3, 4
; CHECK-NEXT: stdcx. 4, 0, 26
-; CHECK-NEXT: bne 0, .LBB2_99
+; CHECK-NEXT: bne- 0, .LBB2_99
; CHECK-NEXT: # %bb.100: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: std 4, ull@toc@l(28)
@@ -3225,7 +3225,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: and 7, 7, 5
; AIX32-NEXT: or 7, 7, 8
; AIX32-NEXT: stwcx. 7, 0, 22
-; AIX32-NEXT: bne 0, L..BB2_1
+; AIX32-NEXT: bne- 0, L..BB2_1
; AIX32-NEXT: # %bb.2: # %entry
; AIX32-NEXT: srw 4, 6, 24
; AIX32-NEXT: lwsync
@@ -3248,7 +3248,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: and 7, 7, 5
; AIX32-NEXT: or 7, 7, 8
; AIX32-NEXT: stwcx. 7, 0, 19
-; AIX32-NEXT: bne 0, L..BB2_3
+; AIX32-NEXT: bne- 0, L..BB2_3
; AIX32-NEXT: # %bb.4: # %entry
; AIX32-NEXT: srw 4, 6, 21
; AIX32-NEXT: lwz 23, L..C2(2) # @ss
@@ -3273,7 +3273,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: and 7, 7, 5
; AIX32-NEXT: or 7, 7, 8
; AIX32-NEXT: stwcx. 7, 0, 16
-; AIX32-NEXT: bne 0, L..BB2_5
+; AIX32-NEXT: bne- 0, L..BB2_5
; AIX32-NEXT: # %bb.6: # %entry
; AIX32-NEXT: srw 4, 6, 18
; AIX32-NEXT: lwz 20, L..C3(2) # @us
@@ -3298,7 +3298,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: and 7, 7, 5
; AIX32-NEXT: or 7, 7, 8
; AIX32-NEXT: stwcx. 7, 0, 14
-; AIX32-NEXT: bne 0, L..BB2_7
+; AIX32-NEXT: bne- 0, L..BB2_7
; AIX32-NEXT: # %bb.8: # %entry
; AIX32-NEXT: srw 4, 6, 15
; AIX32-NEXT: lwsync
@@ -3313,7 +3313,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 4, 0, 13
; AIX32-NEXT: add 4, 3, 4
; AIX32-NEXT: stwcx. 4, 0, 13
-; AIX32-NEXT: bne 0, L..BB2_9
+; AIX32-NEXT: bne- 0, L..BB2_9
; AIX32-NEXT: # %bb.10: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: stw 4, 0(13)
@@ -3325,7 +3325,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 4, 0, 25
; AIX32-NEXT: add 4, 3, 4
; AIX32-NEXT: stwcx. 4, 0, 25
-; AIX32-NEXT: bne 0, L..BB2_11
+; AIX32-NEXT: bne- 0, L..BB2_11
; AIX32-NEXT: # %bb.12: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: lwz 31, L..C6(2) # @sll
@@ -3367,7 +3367,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: and 7, 7, 5
; AIX32-NEXT: or 7, 7, 8
; AIX32-NEXT: stwcx. 7, 0, 22
-; AIX32-NEXT: bne 0, L..BB2_13
+; AIX32-NEXT: bne- 0, L..BB2_13
; AIX32-NEXT: # %bb.14: # %entry
; AIX32-NEXT: srw 4, 6, 24
; AIX32-NEXT: lwsync
@@ -3387,7 +3387,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: and 7, 7, 5
; AIX32-NEXT: or 7, 7, 8
; AIX32-NEXT: stwcx. 7, 0, 19
-; AIX32-NEXT: bne 0, L..BB2_15
+; AIX32-NEXT: bne- 0, L..BB2_15
; AIX32-NEXT: # %bb.16: # %entry
; AIX32-NEXT: srw 4, 6, 21
; AIX32-NEXT: li 5, 0
@@ -3408,7 +3408,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: and 7, 7, 5
; AIX32-NEXT: or 7, 7, 8
; AIX32-NEXT: stwcx. 7, 0, 16
-; AIX32-NEXT: bne 0, L..BB2_17
+; AIX32-NEXT: bne- 0, L..BB2_17
; AIX32-NEXT: # %bb.18: # %entry
; AIX32-NEXT: srw 4, 6, 18
; AIX32-NEXT: lwsync
@@ -3429,7 +3429,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: and 7, 7, 5
; AIX32-NEXT: or 7, 7, 8
; AIX32-NEXT: stwcx. 7, 0, 14
-; AIX32-NEXT: bne 0, L..BB2_19
+; AIX32-NEXT: bne- 0, L..BB2_19
; AIX32-NEXT: # %bb.20: # %entry
; AIX32-NEXT: srw 4, 6, 15
; AIX32-NEXT: lwsync
@@ -3443,7 +3443,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 4, 0, 13
; AIX32-NEXT: sub 4, 4, 3
; AIX32-NEXT: stwcx. 4, 0, 13
-; AIX32-NEXT: bne 0, L..BB2_21
+; AIX32-NEXT: bne- 0, L..BB2_21
; AIX32-NEXT: # %bb.22: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: stw 4, 0(13)
@@ -3454,7 +3454,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 4, 0, 25
; AIX32-NEXT: sub 4, 4, 3
; AIX32-NEXT: stwcx. 4, 0, 25
-; AIX32-NEXT: bne 0, L..BB2_23
+; AIX32-NEXT: bne- 0, L..BB2_23
; AIX32-NEXT: # %bb.24: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: stw 4, 0(25)
@@ -3493,7 +3493,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: and 7, 7, 5
; AIX32-NEXT: or 7, 7, 8
; AIX32-NEXT: stwcx. 7, 0, 22
-; AIX32-NEXT: bne 0, L..BB2_25
+; AIX32-NEXT: bne- 0, L..BB2_25
; AIX32-NEXT: # %bb.26: # %entry
; AIX32-NEXT: srw 4, 6, 24
; AIX32-NEXT: lwsync
@@ -3513,7 +3513,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: and 7, 7, 5
; AIX32-NEXT: or 7, 7, 8
; AIX32-NEXT: stwcx. 7, 0, 19
-; AIX32-NEXT: bne 0, L..BB2_27
+; AIX32-NEXT: bne- 0, L..BB2_27
; AIX32-NEXT: # %bb.28: # %entry
; AIX32-NEXT: srw 4, 6, 21
; AIX32-NEXT: li 5, 0
@@ -3534,7 +3534,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: and 7, 7, 5
; AIX32-NEXT: or 7, 7, 8
; AIX32-NEXT: stwcx. 7, 0, 16
-; AIX32-NEXT: bne 0, L..BB2_29
+; AIX32-NEXT: bne- 0, L..BB2_29
; AIX32-NEXT: # %bb.30: # %entry
; AIX32-NEXT: srw 4, 6, 18
; AIX32-NEXT: lwsync
@@ -3555,7 +3555,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: and 7, 7, 5
; AIX32-NEXT: or 7, 7, 8
; AIX32-NEXT: stwcx. 7, 0, 14
-; AIX32-NEXT: bne 0, L..BB2_31
+; AIX32-NEXT: bne- 0, L..BB2_31
; AIX32-NEXT: # %bb.32: # %entry
; AIX32-NEXT: srw 4, 6, 15
; AIX32-NEXT: lwsync
@@ -3569,7 +3569,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 4, 0, 13
; AIX32-NEXT: or 4, 3, 4
; AIX32-NEXT: stwcx. 4, 0, 13
-; AIX32-NEXT: bne 0, L..BB2_33
+; AIX32-NEXT: bne- 0, L..BB2_33
; AIX32-NEXT: # %bb.34: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: stw 4, 0(13)
@@ -3580,7 +3580,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 4, 0, 25
; AIX32-NEXT: or 4, 3, 4
; AIX32-NEXT: stwcx. 4, 0, 25
-; AIX32-NEXT: bne 0, L..BB2_35
+; AIX32-NEXT: bne- 0, L..BB2_35
; AIX32-NEXT: # %bb.36: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: stw 4, 0(25)
@@ -3617,7 +3617,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: and 7, 7, 5
; AIX32-NEXT: or 7, 7, 8
; AIX32-NEXT: stwcx. 7, 0, 22
-; AIX32-NEXT: bne 0, L..BB2_37
+; AIX32-NEXT: bne- 0, L..BB2_37
; AIX32-NEXT: # %bb.38: # %entry
; AIX32-NEXT: srw 4, 6, 24
; AIX32-NEXT: lwsync
@@ -3637,7 +3637,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: and 7, 7, 5
; AIX32-NEXT: or 7, 7, 8
; AIX32-NEXT: stwcx. 7, 0, 19
-; AIX32-NEXT: bne 0, L..BB2_39
+; AIX32-NEXT: bne- 0, L..BB2_39
; AIX32-NEXT: # %bb.40: # %entry
; AIX32-NEXT: srw 4, 6, 21
; AIX32-NEXT: li 5, 0
@@ -3658,7 +3658,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: and 7, 7, 5
; AIX32-NEXT: or 7, 7, 8
; AIX32-NEXT: stwcx. 7, 0, 16
-; AIX32-NEXT: bne 0, L..BB2_41
+; AIX32-NEXT: bne- 0, L..BB2_41
; AIX32-NEXT: # %bb.42: # %entry
; AIX32-NEXT: srw 4, 6, 18
; AIX32-NEXT: lwsync
@@ -3679,7 +3679,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: and 7, 7, 5
; AIX32-NEXT: or 7, 7, 8
; AIX32-NEXT: stwcx. 7, 0, 14
-; AIX32-NEXT: bne 0, L..BB2_43
+; AIX32-NEXT: bne- 0, L..BB2_43
; AIX32-NEXT: # %bb.44: # %entry
; AIX32-NEXT: srw 4, 6, 15
; AIX32-NEXT: lwsync
@@ -3693,7 +3693,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 4, 0, 13
; AIX32-NEXT: xor 4, 3, 4
; AIX32-NEXT: stwcx. 4, 0, 13
-; AIX32-NEXT: bne 0, L..BB2_45
+; AIX32-NEXT: bne- 0, L..BB2_45
; AIX32-NEXT: # %bb.46: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: stw 4, 0(13)
@@ -3704,7 +3704,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 4, 0, 25
; AIX32-NEXT: xor 4, 3, 4
; AIX32-NEXT: stwcx. 4, 0, 25
-; AIX32-NEXT: bne 0, L..BB2_47
+; AIX32-NEXT: bne- 0, L..BB2_47
; AIX32-NEXT: # %bb.48: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: stw 4, 0(25)
@@ -3741,7 +3741,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: and 7, 7, 5
; AIX32-NEXT: or 7, 7, 8
; AIX32-NEXT: stwcx. 7, 0, 22
-; AIX32-NEXT: bne 0, L..BB2_49
+; AIX32-NEXT: bne- 0, L..BB2_49
; AIX32-NEXT: # %bb.50: # %entry
; AIX32-NEXT: srw 4, 6, 24
; AIX32-NEXT: lwsync
@@ -3761,7 +3761,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: and 7, 7, 5
; AIX32-NEXT: or 7, 7, 8
; AIX32-NEXT: stwcx. 7, 0, 19
-; AIX32-NEXT: bne 0, L..BB2_51
+; AIX32-NEXT: bne- 0, L..BB2_51
; AIX32-NEXT: # %bb.52: # %entry
; AIX32-NEXT: srw 4, 6, 21
; AIX32-NEXT: li 5, 0
@@ -3782,7 +3782,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: and 7, 7, 5
; AIX32-NEXT: or 7, 7, 8
; AIX32-NEXT: stwcx. 7, 0, 16
-; AIX32-NEXT: bne 0, L..BB2_53
+; AIX32-NEXT: bne- 0, L..BB2_53
; AIX32-NEXT: # %bb.54: # %entry
; AIX32-NEXT: srw 4, 6, 18
; AIX32-NEXT: lwsync
@@ -3803,7 +3803,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: and 7, 7, 5
; AIX32-NEXT: or 7, 7, 8
; AIX32-NEXT: stwcx. 7, 0, 14
-; AIX32-NEXT: bne 0, L..BB2_55
+; AIX32-NEXT: bne- 0, L..BB2_55
; AIX32-NEXT: # %bb.56: # %entry
; AIX32-NEXT: srw 4, 6, 15
; AIX32-NEXT: lwsync
@@ -3817,7 +3817,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 4, 0, 13
; AIX32-NEXT: nand 4, 3, 4
; AIX32-NEXT: stwcx. 4, 0, 13
-; AIX32-NEXT: bne 0, L..BB2_57
+; AIX32-NEXT: bne- 0, L..BB2_57
; AIX32-NEXT: # %bb.58: # %entry
; AIX32-NEXT: stw 23, 56(1) # 4-byte Folded Spill
; AIX32-NEXT: stw 27, 60(1) # 4-byte Folded Spill
@@ -3830,7 +3830,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 4, 0, 25
; AIX32-NEXT: nand 4, 3, 4
; AIX32-NEXT: stwcx. 4, 0, 25
-; AIX32-NEXT: bne 0, L..BB2_59
+; AIX32-NEXT: bne- 0, L..BB2_59
; AIX32-NEXT: # %bb.60: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: stw 4, 0(25)
@@ -3951,7 +3951,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: and 7, 7, 5
; AIX32-NEXT: or 7, 7, 8
; AIX32-NEXT: stwcx. 7, 0, 22
-; AIX32-NEXT: bne 0, L..BB2_65
+; AIX32-NEXT: bne- 0, L..BB2_65
; AIX32-NEXT: # %bb.66: # %atomicrmw.end
; AIX32-NEXT: srw 4, 6, 24
; AIX32-NEXT: lwsync
@@ -3973,7 +3973,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: and 7, 7, 5
; AIX32-NEXT: or 7, 7, 8
; AIX32-NEXT: stwcx. 7, 0, 19
-; AIX32-NEXT: bne 0, L..BB2_67
+; AIX32-NEXT: bne- 0, L..BB2_67
; AIX32-NEXT: # %bb.68: # %atomicrmw.end
; AIX32-NEXT: srw 4, 6, 21
; AIX32-NEXT: li 5, 0
@@ -3993,7 +3993,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: and 7, 7, 5
; AIX32-NEXT: or 7, 7, 8
; AIX32-NEXT: stwcx. 7, 0, 16
-; AIX32-NEXT: bne 0, L..BB2_69
+; AIX32-NEXT: bne- 0, L..BB2_69
; AIX32-NEXT: # %bb.70: # %atomicrmw.end
; AIX32-NEXT: srw 4, 6, 18
; AIX32-NEXT: lwsync
@@ -4014,7 +4014,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: and 7, 7, 5
; AIX32-NEXT: or 7, 7, 8
; AIX32-NEXT: stwcx. 7, 0, 14
-; AIX32-NEXT: bne 0, L..BB2_71
+; AIX32-NEXT: bne- 0, L..BB2_71
; AIX32-NEXT: # %bb.72: # %atomicrmw.end
; AIX32-NEXT: srw 4, 6, 15
; AIX32-NEXT: lwsync
@@ -4028,7 +4028,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 4, 0, 13
; AIX32-NEXT: and 4, 3, 4
; AIX32-NEXT: stwcx. 4, 0, 13
-; AIX32-NEXT: bne 0, L..BB2_73
+; AIX32-NEXT: bne- 0, L..BB2_73
; AIX32-NEXT: # %bb.74: # %atomicrmw.end
; AIX32-NEXT: lwsync
; AIX32-NEXT: stw 4, 0(13)
@@ -4039,7 +4039,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 {
; AIX32-NEXT: lwarx 4, 0, 25
; AIX32-NEXT: and 4, 3, 4
; AIX32-NEXT: stwcx. 4, 0, 25
-; AIX32-NEXT: bne 0, L..BB2_75
+; AIX32-NEXT: bne- 0, L..BB2_75
; AIX32-NEXT: # %bb.76: # %atomicrmw.end
; AIX32-NEXT: lwsync
; AIX32-NEXT: stw 4, 0(25)
@@ -5371,7 +5371,7 @@ define dso_local void @test_lock() local_unnamed_addr #0 {
; CHECK-NEXT: #
; CHECK-NEXT: lbarx 5, 0, 4
; CHECK-NEXT: stbcx. 7, 0, 4
-; CHECK-NEXT: bne 0, .LBB4_1
+; CHECK-NEXT: bne- 0, .LBB4_1
; CHECK-NEXT: # %bb.2: # %entry
; CHECK-NEXT: addis 4, 2, uc@toc@ha
; CHECK-NEXT: lwsync
@@ -5382,7 +5382,7 @@ define dso_local void @test_lock() local_unnamed_addr #0 {
; CHECK-NEXT: #
; CHECK-NEXT: lbarx 5, 0, 6
; CHECK-NEXT: stbcx. 7, 0, 6
-; CHECK-NEXT: bne 0, .LBB4_3
+; CHECK-NEXT: bne- 0, .LBB4_3
; CHECK-NEXT: # %bb.4: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stb 5, uc@toc@l(4)
@@ -5393,7 +5393,7 @@ define dso_local void @test_lock() local_unnamed_addr #0 {
; CHECK-NEXT: #
; CHECK-NEXT: lharx 6, 0, 8
; CHECK-NEXT: sthcx. 7, 0, 8
-; CHECK-NEXT: bne 0, .LBB4_5
+; CHECK-NEXT: bne- 0, .LBB4_5
; CHECK-NEXT: # %bb.6: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sth 6, ss@toc@l(5)
@@ -5404,7 +5404,7 @@ define dso_local void @test_lock() local_unnamed_addr #0 {
; CHECK-NEXT: #
; CHECK-NEXT: lharx 8, 0, 9
; CHECK-NEXT: sthcx. 7, 0, 9
-; CHECK-NEXT: bne 0, .LBB4_7
+; CHECK-NEXT: bne- 0, .LBB4_7
; CHECK-NEXT: # %bb.8: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: sth 8, us@toc@l(6)
@@ -5415,7 +5415,7 @@ define dso_local void @test_lock() local_unnamed_addr #0 {
; CHECK-NEXT: #
; CHECK-NEXT: lwarx 9, 0, 10
; CHECK-NEXT: stwcx. 7, 0, 10
-; CHECK-NEXT: bne 0, .LBB4_9
+; CHECK-NEXT: bne- 0, .LBB4_9
; CHECK-NEXT: # %bb.10: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stw 9, si@toc@l(8)
@@ -5426,7 +5426,7 @@ define dso_local void @test_lock() local_unnamed_addr #0 {
; CHECK-NEXT: #
; CHECK-NEXT: lwarx 10, 0, 11
; CHECK-NEXT: stwcx. 7, 0, 11
-; CHECK-NEXT: bne 0, .LBB4_11
+; CHECK-NEXT: bne- 0, .LBB4_11
; CHECK-NEXT: # %bb.12: # %entry
; CHECK-NEXT: addis 7, 2, sll@toc@ha
; CHECK-NEXT: lwsync
@@ -5438,7 +5438,7 @@ define dso_local void @test_lock() local_unnamed_addr #0 {
; CHECK-NEXT: #
; CHECK-NEXT: ldarx 12, 0, 10
; CHECK-NEXT: stdcx. 11, 0, 10
-; CHECK-NEXT: bne 0, .LBB4_13
+; CHECK-NEXT: bne- 0, .LBB4_13
; CHECK-NEXT: # %bb.14: # %entry
; CHECK-NEXT: addis 10, 2, ull@toc@ha
; CHECK-NEXT: lwsync
@@ -5449,7 +5449,7 @@ define dso_local void @test_lock() local_unnamed_addr #0 {
; CHECK-NEXT: #
; CHECK-NEXT: ldarx 12, 0, 0
; CHECK-NEXT: stdcx. 11, 0, 0
-; CHECK-NEXT: bne 0, .LBB4_15
+; CHECK-NEXT: bne- 0, .LBB4_15
; CHECK-NEXT: # %bb.16: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: std 12, ull@toc@l(10)
@@ -5504,7 +5504,7 @@ define dso_local void @test_lock() local_unnamed_addr #0 {
; AIX32-NEXT: andc 9, 8, 6
; AIX32-NEXT: or 9, 7, 9
; AIX32-NEXT: stwcx. 9, 0, 5
-; AIX32-NEXT: bne 0, L..BB4_1
+; AIX32-NEXT: bne- 0, L..BB4_1
; AIX32-NEXT: # %bb.2: # %entry
; AIX32-NEXT: srw 4, 8, 4
; AIX32-NEXT: lwz 28, L..C1(2) # @uc
@@ -5525,7 +5525,7 @@ define dso_local void @test_lock() local_unnamed_addr #0 {
; AIX32-NEXT: andc 9, 8, 6
; AIX32-NEXT: or 9, 7, 9
; AIX32-NEXT: stwcx. 9, 0, 5
-; AIX32-NEXT: bne 0, L..BB4_3
+; AIX32-NEXT: bne- 0, L..BB4_3
; AIX32-NEXT: # %bb.4: # %entry
; AIX32-NEXT: srw 4, 8, 4
; AIX32-NEXT: lwz 27, L..C2(2) # @ss
@@ -5547,7 +5547,7 @@ define dso_local void @test_lock() local_unnamed_addr #0 {
; AIX32-NEXT: andc 9, 8, 6
; AIX32-NEXT: or 9, 7, 9
; AIX32-NEXT: stwcx. 9, 0, 5
-; AIX32-NEXT: bne 0, L..BB4_5
+; AIX32-NEXT: bne- 0, L..BB4_5
; AIX32-NEXT: # %bb.6: # %entry
; AIX32-NEXT: srw 4, 8, 4
; AIX32-NEXT: lwz 26, L..C3(2) # @us
@@ -5569,7 +5569,7 @@ define dso_local void @test_lock() local_unnamed_addr #0 {
; AIX32-NEXT: andc 9, 8, 6
; AIX32-NEXT: or 9, 7, 9
; AIX32-NEXT: stwcx. 9, 0, 5
-; AIX32-NEXT: bne 0, L..BB4_7
+; AIX32-NEXT: bne- 0, L..BB4_7
; AIX32-NEXT: # %bb.8: # %entry
; AIX32-NEXT: srw 4, 8, 4
; AIX32-NEXT: lwsync
@@ -5581,7 +5581,7 @@ define dso_local void @test_lock() local_unnamed_addr #0 {
; AIX32-NEXT: #
; AIX32-NEXT: lwarx 4, 0, 25
; AIX32-NEXT: stwcx. 3, 0, 25
-; AIX32-NEXT: bne 0, L..BB4_9
+; AIX32-NEXT: bne- 0, L..BB4_9
; AIX32-NEXT: # %bb.10: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: stw 4, 0(25)
@@ -5591,7 +5591,7 @@ define dso_local void @test_lock() local_unnamed_addr #0 {
; AIX32-NEXT: #
; AIX32-NEXT: lwarx 4, 0, 24
; AIX32-NEXT: stwcx. 3, 0, 24
-; AIX32-NEXT: bne 0, L..BB4_11
+; AIX32-NEXT: bne- 0, L..BB4_11
; AIX32-NEXT: # %bb.12: # %entry
; AIX32-NEXT: lwz 31, L..C6(2) # @sll
; AIX32-NEXT: lwsync
@@ -5695,7 +5695,7 @@ define dso_local void @test_atomic() local_unnamed_addr #0 {
; CHECK-NEXT: # %bb.2: # %entry
; CHECK-NEXT: #
; CHECK-NEXT: stwcx. 3, 0, 6
-; CHECK-NEXT: bne 0, .LBB5_1
+; CHECK-NEXT: bne- 0, .LBB5_1
; CHECK-NEXT: .LBB5_3: # %entry
; CHECK-NEXT: stw 5, ui@toc@l(4)
; CHECK-NEXT: addis 5, 2, si@toc@ha
@@ -5709,7 +5709,7 @@ define dso_local void @test_atomic() local_unnamed_addr #0 {
; CHECK-NEXT: # %bb.5: # %entry
; CHECK-NEXT: #
; CHECK-NEXT: stwcx. 3, 0, 7
-; CHECK-NEXT: bne 0, .LBB5_4
+; CHECK-NEXT: bne- 0, .LBB5_4
; CHECK-NEXT: .LBB5_6: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stw 8, si@toc@l(5)
@@ -5721,7 +5721,7 @@ define dso_local void @test_atomic() local_unnamed_addr #0 {
; CHECK-NEXT: # %bb.8: # %entry
; CHECK-NEXT: #
; CHECK-NEXT: stwcx. 3, 0, 6
-; CHECK-NEXT: bne 0, .LBB5_7
+; CHECK-NEXT: bne- 0, .LBB5_7
; CHECK-NEXT: .LBB5_9: # %entry
; CHECK-NEXT: lwsync
; CHECK-NEXT: stw 8, ui@toc@l(4)
@@ -5734,7 +5734,7 @@ define dso_local void @test_atomic() local_unnamed_addr #0 {
; CHECK-NEXT: # %bb.11: # %entry
; CHECK-NEXT: #
; CHECK-NEXT: stwcx. 3, 0, 7
-; CHECK-NEXT: bne 0, .LBB5_10
+; CHECK-NEXT: bne- 0, .LBB5_10
; CHECK-NEXT: .LBB5_12: # %entry
; CHECK-NEXT: stw 4, si@toc@l(5)
; CHECK-NEXT: blr
@@ -5751,7 +5751,7 @@ define dso_local void @test_atomic() local_unnamed_addr #0 {
; AIX32-NEXT: # %bb.2: # %entry
; AIX32-NEXT: #
; AIX32-NEXT: stwcx. 3, 0, 4
-; AIX32-NEXT: bne 0, L..BB5_1
+; AIX32-NEXT: bne- 0, L..BB5_1
; AIX32-NEXT: L..BB5_3: # %entry
; AIX32-NEXT: stw 5, 0(4)
; AIX32-NEXT: lwz 5, L..C4(2) # @si
@@ -5764,7 +5764,7 @@ define dso_local void @test_atomic() local_unnamed_addr #0 {
; AIX32-NEXT: # %bb.5: # %entry
; AIX32-NEXT: #
; AIX32-NEXT: stwcx. 3, 0, 5
-; AIX32-NEXT: bne 0, L..BB5_4
+; AIX32-NEXT: bne- 0, L..BB5_4
; AIX32-NEXT: L..BB5_6: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: stw 6, 0(5)
@@ -5776,7 +5776,7 @@ define dso_local void @test_atomic() local_unnamed_addr #0 {
; AIX32-NEXT: # %bb.8: # %entry
; AIX32-NEXT: #
; AIX32-NEXT: stwcx. 3, 0, 4
-; AIX32-NEXT: bne 0, L..BB5_7
+; AIX32-NEXT: bne- 0, L..BB5_7
; AIX32-NEXT: L..BB5_9: # %entry
; AIX32-NEXT: lwsync
; AIX32-NEXT: stw 6, 0(4)
@@ -5789,7 +5789,7 @@ define dso_local void @test_atomic() local_unnamed_addr #0 {
; AIX32-NEXT: # %bb.11: # %entry
; AIX32-NEXT: #
; AIX32-NEXT: stwcx. 3, 0, 5
-; AIX32-NEXT: bne 0, L..BB5_10
+; AIX32-NEXT: bne- 0, L..BB5_10
; AIX32-NEXT: L..BB5_12: # %entry
; AIX32-NEXT: stw 4, 0(5)
; AIX32-NEXT: blr
@@ -5870,7 +5870,7 @@ define dso_local i64 @atommax8(ptr nocapture noundef %ptr, i64 noundef %val) loc
; CHECK-NEXT: # %bb.2: # %entry
; CHECK-NEXT: #
; CHECK-NEXT: stdcx. 4, 0, 3
-; CHECK-NEXT: bne 0, .LBB7_1
+; CHECK-NEXT: bne- 0, .LBB7_1
; CHECK-NEXT: .LBB7_3: # %entry
; CHECK-NEXT: li 3, 55
; CHECK-NEXT: li 4, 66
@@ -5954,7 +5954,7 @@ define dso_local signext i32 @atommax4(ptr nocapture noundef %ptr, i32 noundef s
; CHECK-NEXT: # %bb.2: # %entry
; CHECK-NEXT: #
; CHECK-NEXT: stwcx. 4, 0, 3
-; CHECK-NEXT: bne 0, .LBB8_1
+; CHECK-NEXT: bne- 0, .LBB8_1
; CHECK-NEXT: .LBB8_3: # %entry
; CHECK-NEXT: li 3, 55
; CHECK-NEXT: li 4, 66
@@ -5973,7 +5973,7 @@ define dso_local signext i32 @atommax4(ptr nocapture noundef %ptr, i32 noundef s
; AIX32-NEXT: # %bb.2: # %entry
; AIX32-NEXT: #
; AIX32-NEXT: stwcx. 4, 0, 3
-; AIX32-NEXT: bne 0, L..BB8_1
+; AIX32-NEXT: bne- 0, L..BB8_1
; AIX32-NEXT: L..BB8_3: # %entry
; AIX32-NEXT: li 3, 55
; AIX32-NEXT: li 4, 66
@@ -6000,7 +6000,7 @@ define dso_local signext i16 @atommax2(ptr nocapture noundef %ptr, i16 noundef s
; CHECK-NEXT: # %bb.2: # %entry
; CHECK-NEXT: #
; CHECK-NEXT: sthcx. 4, 0, 3
-; CHECK-NEXT: bne 0, .LBB9_1
+; CHECK-NEXT: bne- 0, .LBB9_1
; CHECK-NEXT: .LBB9_3: # %entry
; CHECK-NEXT: li 3, 55
; CHECK-NEXT: li 4, 66
@@ -6033,7 +6033,7 @@ define dso_local signext i16 @atommax2(ptr nocapture noundef %ptr, i16 noundef s
; AIX32-NEXT: andc 10, 9, 7
; AIX32-NEXT: or 10, 8, 10
; AIX32-NEXT: stwcx. 10, 0, 3
-; AIX32-NEXT: bne 0, L..BB9_1
+; AIX32-NEXT: bne- 0, L..BB9_1
; AIX32-NEXT: L..BB9_3: # %entry
; AIX32-NEXT: srw 3, 9, 6
; AIX32-NEXT: lwsync
@@ -6063,7 +6063,7 @@ define dso_local zeroext i8 @atommax1(ptr nocapture noundef %ptr, i8 noundef zer
; CHECK-NEXT: # %bb.2: # %entry
; CHECK-NEXT: #
; CHECK-NEXT: stbcx. 4, 0, 3
-; CHECK-NEXT: bne 0, .LBB10_1
+; CHECK-NEXT: bne- 0, .LBB10_1
; CHECK-NEXT: .LBB10_3: # %entry
; CHECK-NEXT: li 3, 55
; CHECK-NEXT: li 4, 66
@@ -6092,7 +6092,7 @@ define dso_local zeroext i8 @atommax1(ptr nocapture noundef %ptr, i8 noundef zer
; AIX32-NEXT: andc 10, 9, 7
; AIX32-NEXT: or 10, 8, 10
; AIX32-NEXT: stwcx. 10, 0, 3
-; AIX32-NEXT: bne 0, L..BB10_1
+; AIX32-NEXT: bne- 0, L..BB10_1
; AIX32-NEXT: L..BB10_3: # %entry
; AIX32-NEXT: srw 3, 9, 5
; AIX32-NEXT: lwsync
diff --git a/llvm/test/CodeGen/PowerPC/atomic-minmax.ll b/llvm/test/CodeGen/PowerPC/atomic-minmax.ll
index 747d9e5..44a4f16 100644
--- a/llvm/test/CodeGen/PowerPC/atomic-minmax.ll
+++ b/llvm/test/CodeGen/PowerPC/atomic-minmax.ll
@@ -14,7 +14,7 @@ define void @a32min(ptr nocapture dereferenceable(4) %minimum, i32 %val) #0 {
; CHECK-NEXT: # %bb.2: # %entry
; CHECK-NEXT: #
; CHECK-NEXT: stwcx. 4, 0, 3
-; CHECK-NEXT: bne 0, .LBB0_1
+; CHECK-NEXT: bne- 0, .LBB0_1
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: blr
entry:
@@ -34,7 +34,7 @@ define void @a32max(ptr nocapture dereferenceable(4) %minimum, i32 %val) #0 {
; CHECK-NEXT: # %bb.2: # %entry
; CHECK-NEXT: #
; CHECK-NEXT: stwcx. 4, 0, 3
-; CHECK-NEXT: bne 0, .LBB1_1
+; CHECK-NEXT: bne- 0, .LBB1_1
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: blr
entry:
@@ -54,7 +54,7 @@ define void @a32umin(ptr nocapture dereferenceable(4) %minimum, i32 %val) #0 {
; CHECK-NEXT: # %bb.2: # %entry
; CHECK-NEXT: #
; CHECK-NEXT: stwcx. 4, 0, 3
-; CHECK-NEXT: bne 0, .LBB2_1
+; CHECK-NEXT: bne- 0, .LBB2_1
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: blr
entry:
@@ -74,7 +74,7 @@ define void @a32umax(ptr nocapture dereferenceable(4) %minimum, i32 %val) #0 {
; CHECK-NEXT: # %bb.2: # %entry
; CHECK-NEXT: #
; CHECK-NEXT: stwcx. 4, 0, 3
-; CHECK-NEXT: bne 0, .LBB3_1
+; CHECK-NEXT: bne- 0, .LBB3_1
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: blr
entry:
@@ -96,7 +96,7 @@ define void @a16min(ptr nocapture dereferenceable(4) %minimum, i16 %val) #1 {
; CHECK-NEXT: # %bb.2: # %entry
; CHECK-NEXT: #
; CHECK-NEXT: sthcx. 4, 0, 3
-; CHECK-NEXT: bne 0, .LBB4_1
+; CHECK-NEXT: bne- 0, .LBB4_1
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: blr
entry:
@@ -118,7 +118,7 @@ define void @a16max(ptr nocapture dereferenceable(4) %minimum, i16 %val) #1 {
; CHECK-NEXT: # %bb.2: # %entry
; CHECK-NEXT: #
; CHECK-NEXT: sthcx. 4, 0, 3
-; CHECK-NEXT: bne 0, .LBB5_1
+; CHECK-NEXT: bne- 0, .LBB5_1
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: blr
entry:
@@ -138,7 +138,7 @@ define void @a16umin(ptr nocapture dereferenceable(4) %minimum, i16 %val) #1 {
; CHECK-NEXT: # %bb.2: # %entry
; CHECK-NEXT: #
; CHECK-NEXT: sthcx. 4, 0, 3
-; CHECK-NEXT: bne 0, .LBB6_1
+; CHECK-NEXT: bne- 0, .LBB6_1
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: blr
entry:
@@ -158,7 +158,7 @@ define void @a16umax(ptr nocapture dereferenceable(4) %minimum, i16 %val) #1 {
; CHECK-NEXT: # %bb.2: # %entry
; CHECK-NEXT: #
; CHECK-NEXT: sthcx. 4, 0, 3
-; CHECK-NEXT: bne 0, .LBB7_1
+; CHECK-NEXT: bne- 0, .LBB7_1
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: blr
entry:
@@ -180,7 +180,7 @@ define void @a8min(ptr nocapture dereferenceable(4) %minimum, i8 %val) #1 {
; CHECK-NEXT: # %bb.2: # %entry
; CHECK-NEXT: #
; CHECK-NEXT: stbcx. 4, 0, 3
-; CHECK-NEXT: bne 0, .LBB8_1
+; CHECK-NEXT: bne- 0, .LBB8_1
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: blr
entry:
@@ -202,7 +202,7 @@ define void @a8max(ptr nocapture dereferenceable(4) %minimum, i8 %val) #1 {
; CHECK-NEXT: # %bb.2: # %entry
; CHECK-NEXT: #
; CHECK-NEXT: stbcx. 4, 0, 3
-; CHECK-NEXT: bne 0, .LBB9_1
+; CHECK-NEXT: bne- 0, .LBB9_1
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: blr
entry:
@@ -222,7 +222,7 @@ define void @a8umin(ptr nocapture dereferenceable(4) %minimum, i8 %val) #1 {
; CHECK-NEXT: # %bb.2: # %entry
; CHECK-NEXT: #
; CHECK-NEXT: stbcx. 4, 0, 3
-; CHECK-NEXT: bne 0, .LBB10_1
+; CHECK-NEXT: bne- 0, .LBB10_1
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: blr
entry:
@@ -242,7 +242,7 @@ define void @a8umax(ptr nocapture dereferenceable(4) %minimum, i8 %val) #1 {
; CHECK-NEXT: # %bb.2: # %entry
; CHECK-NEXT: #
; CHECK-NEXT: stbcx. 4, 0, 3
-; CHECK-NEXT: bne 0, .LBB11_1
+; CHECK-NEXT: bne- 0, .LBB11_1
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: blr
entry:
@@ -262,7 +262,7 @@ define void @a64min(ptr nocapture dereferenceable(4) %minimum, i64 %val) #0 {
; CHECK-NEXT: # %bb.2: # %entry
; CHECK-NEXT: #
; CHECK-NEXT: stdcx. 4, 0, 3
-; CHECK-NEXT: bne 0, .LBB12_1
+; CHECK-NEXT: bne- 0, .LBB12_1
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: blr
entry:
@@ -282,7 +282,7 @@ define void @a64max(ptr nocapture dereferenceable(4) %minimum, i64 %val) #0 {
; CHECK-NEXT: # %bb.2: # %entry
; CHECK-NEXT: #
; CHECK-NEXT: stdcx. 4, 0, 3
-; CHECK-NEXT: bne 0, .LBB13_1
+; CHECK-NEXT: bne- 0, .LBB13_1
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: blr
entry:
@@ -302,7 +302,7 @@ define void @a64umin(ptr nocapture dereferenceable(4) %minimum, i64 %val) #0 {
; CHECK-NEXT: # %bb.2: # %entry
; CHECK-NEXT: #
; CHECK-NEXT: stdcx. 4, 0, 3
-; CHECK-NEXT: bne 0, .LBB14_1
+; CHECK-NEXT: bne- 0, .LBB14_1
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: blr
entry:
@@ -322,7 +322,7 @@ define void @a64umax(ptr nocapture dereferenceable(4) %minimum, i64 %val) #0 {
; CHECK-NEXT: # %bb.2: # %entry
; CHECK-NEXT: #
; CHECK-NEXT: stdcx. 4, 0, 3
-; CHECK-NEXT: bne 0, .LBB15_1
+; CHECK-NEXT: bne- 0, .LBB15_1
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: blr
entry:
@@ -356,7 +356,7 @@ define void @ae16min(ptr nocapture dereferenceable(4) %minimum, i16 %val) #0 {
; CHECK-NEXT: andc 8, 8, 6
; CHECK-NEXT: or 8, 7, 8
; CHECK-NEXT: stwcx. 8, 0, 3
-; CHECK-NEXT: bne 0, .LBB16_1
+; CHECK-NEXT: bne- 0, .LBB16_1
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: blr
entry:
@@ -390,7 +390,7 @@ define void @ae16max(ptr nocapture dereferenceable(4) %minimum, i16 %val) #0 {
; CHECK-NEXT: andc 8, 8, 6
; CHECK-NEXT: or 8, 7, 8
; CHECK-NEXT: stwcx. 8, 0, 3
-; CHECK-NEXT: bne 0, .LBB17_1
+; CHECK-NEXT: bne- 0, .LBB17_1
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: blr
entry:
@@ -421,7 +421,7 @@ define void @ae16umin(ptr nocapture dereferenceable(4) %minimum, i16 %val) #0 {
; CHECK-NEXT: andc 7, 7, 5
; CHECK-NEXT: or 7, 6, 7
; CHECK-NEXT: stwcx. 7, 0, 3
-; CHECK-NEXT: bne 0, .LBB18_1
+; CHECK-NEXT: bne- 0, .LBB18_1
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: blr
entry:
@@ -452,7 +452,7 @@ define void @ae16umax(ptr nocapture dereferenceable(4) %minimum, i16 %val) #0 {
; CHECK-NEXT: andc 7, 7, 5
; CHECK-NEXT: or 7, 6, 7
; CHECK-NEXT: stwcx. 7, 0, 3
-; CHECK-NEXT: bne 0, .LBB19_1
+; CHECK-NEXT: bne- 0, .LBB19_1
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: blr
entry:
@@ -485,7 +485,7 @@ define void @ae8min(ptr nocapture dereferenceable(4) %minimum, i8 %val) #0 {
; CHECK-NEXT: andc 8, 8, 6
; CHECK-NEXT: or 8, 7, 8
; CHECK-NEXT: stwcx. 8, 0, 3
-; CHECK-NEXT: bne 0, .LBB20_1
+; CHECK-NEXT: bne- 0, .LBB20_1
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: blr
entry:
@@ -518,7 +518,7 @@ define void @ae8max(ptr nocapture dereferenceable(4) %minimum, i8 %val) #0 {
; CHECK-NEXT: andc 8, 8, 6
; CHECK-NEXT: or 8, 7, 8
; CHECK-NEXT: stwcx. 8, 0, 3
-; CHECK-NEXT: bne 0, .LBB21_1
+; CHECK-NEXT: bne- 0, .LBB21_1
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: blr
entry:
@@ -548,7 +548,7 @@ define void @ae8umin(ptr nocapture dereferenceable(4) %minimum, i8 %val) #0 {
; CHECK-NEXT: andc 7, 7, 5
; CHECK-NEXT: or 7, 6, 7
; CHECK-NEXT: stwcx. 7, 0, 3
-; CHECK-NEXT: bne 0, .LBB22_1
+; CHECK-NEXT: bne- 0, .LBB22_1
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: blr
entry:
@@ -578,7 +578,7 @@ define void @ae8umax(ptr nocapture dereferenceable(4) %minimum, i8 %val) #0 {
; CHECK-NEXT: andc 7, 7, 5
; CHECK-NEXT: or 7, 6, 7
; CHECK-NEXT: stwcx. 7, 0, 3
-; CHECK-NEXT: bne 0, .LBB23_1
+; CHECK-NEXT: bne- 0, .LBB23_1
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: blr
entry:
diff --git a/llvm/test/CodeGen/PowerPC/atomics-regression.ll b/llvm/test/CodeGen/PowerPC/atomics-regression.ll
index 90990bb..cfc3a99 100644
--- a/llvm/test/CodeGen/PowerPC/atomics-regression.ll
+++ b/llvm/test/CodeGen/PowerPC/atomics-regression.ll
@@ -2291,7 +2291,7 @@ define i8 @test120(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB120_1:
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: stbcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB120_1
+; PPC64LE-NEXT: bne- 0, .LBB120_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -2306,7 +2306,7 @@ define i8 @test121(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB121_1:
; PPC64LE-NEXT: lbarx 3, 0, 5
; PPC64LE-NEXT: stbcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB121_1
+; PPC64LE-NEXT: bne- 0, .LBB121_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -2321,7 +2321,7 @@ define i8 @test122(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB122_1:
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: stbcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB122_1
+; PPC64LE-NEXT: bne- 0, .LBB122_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -2336,7 +2336,7 @@ define i8 @test123(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB123_1:
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: stbcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB123_1
+; PPC64LE-NEXT: bne- 0, .LBB123_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -2352,7 +2352,7 @@ define i8 @test124(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB124_1:
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: stbcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB124_1
+; PPC64LE-NEXT: bne- 0, .LBB124_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -2367,7 +2367,7 @@ define i16 @test125(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB125_1:
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: sthcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB125_1
+; PPC64LE-NEXT: bne- 0, .LBB125_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -2382,7 +2382,7 @@ define i16 @test126(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB126_1:
; PPC64LE-NEXT: lharx 3, 0, 5
; PPC64LE-NEXT: sthcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB126_1
+; PPC64LE-NEXT: bne- 0, .LBB126_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -2397,7 +2397,7 @@ define i16 @test127(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB127_1:
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: sthcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB127_1
+; PPC64LE-NEXT: bne- 0, .LBB127_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -2412,7 +2412,7 @@ define i16 @test128(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB128_1:
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: sthcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB128_1
+; PPC64LE-NEXT: bne- 0, .LBB128_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -2428,7 +2428,7 @@ define i16 @test129(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB129_1:
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: sthcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB129_1
+; PPC64LE-NEXT: bne- 0, .LBB129_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -2443,7 +2443,7 @@ define i32 @test130(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB130_1:
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB130_1
+; PPC64LE-NEXT: bne- 0, .LBB130_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -2458,7 +2458,7 @@ define i32 @test131(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB131_1:
; PPC64LE-NEXT: lwarx 3, 0, 5
; PPC64LE-NEXT: stwcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB131_1
+; PPC64LE-NEXT: bne- 0, .LBB131_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -2473,7 +2473,7 @@ define i32 @test132(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB132_1:
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB132_1
+; PPC64LE-NEXT: bne- 0, .LBB132_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -2488,7 +2488,7 @@ define i32 @test133(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB133_1:
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB133_1
+; PPC64LE-NEXT: bne- 0, .LBB133_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -2504,7 +2504,7 @@ define i32 @test134(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB134_1:
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB134_1
+; PPC64LE-NEXT: bne- 0, .LBB134_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -2519,7 +2519,7 @@ define i64 @test135(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB135_1:
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB135_1
+; PPC64LE-NEXT: bne- 0, .LBB135_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -2534,7 +2534,7 @@ define i64 @test136(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB136_1:
; PPC64LE-NEXT: ldarx 3, 0, 5
; PPC64LE-NEXT: stdcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB136_1
+; PPC64LE-NEXT: bne- 0, .LBB136_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -2549,7 +2549,7 @@ define i64 @test137(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB137_1:
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB137_1
+; PPC64LE-NEXT: bne- 0, .LBB137_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -2564,7 +2564,7 @@ define i64 @test138(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB138_1:
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB138_1
+; PPC64LE-NEXT: bne- 0, .LBB138_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -2580,7 +2580,7 @@ define i64 @test139(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB139_1:
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB139_1
+; PPC64LE-NEXT: bne- 0, .LBB139_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -2596,7 +2596,7 @@ define i8 @test140(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB140_1
+; PPC64LE-NEXT: bne- 0, .LBB140_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -2612,7 +2612,7 @@ define i8 @test141(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 3, 0, 5
; PPC64LE-NEXT: add 6, 4, 3
; PPC64LE-NEXT: stbcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB141_1
+; PPC64LE-NEXT: bne- 0, .LBB141_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -2628,7 +2628,7 @@ define i8 @test142(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB142_1
+; PPC64LE-NEXT: bne- 0, .LBB142_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -2644,7 +2644,7 @@ define i8 @test143(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB143_1
+; PPC64LE-NEXT: bne- 0, .LBB143_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -2661,7 +2661,7 @@ define i8 @test144(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB144_1
+; PPC64LE-NEXT: bne- 0, .LBB144_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -2677,7 +2677,7 @@ define i16 @test145(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB145_1
+; PPC64LE-NEXT: bne- 0, .LBB145_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -2693,7 +2693,7 @@ define i16 @test146(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 3, 0, 5
; PPC64LE-NEXT: add 6, 4, 3
; PPC64LE-NEXT: sthcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB146_1
+; PPC64LE-NEXT: bne- 0, .LBB146_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -2709,7 +2709,7 @@ define i16 @test147(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB147_1
+; PPC64LE-NEXT: bne- 0, .LBB147_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -2725,7 +2725,7 @@ define i16 @test148(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB148_1
+; PPC64LE-NEXT: bne- 0, .LBB148_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -2742,7 +2742,7 @@ define i16 @test149(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB149_1
+; PPC64LE-NEXT: bne- 0, .LBB149_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -2758,7 +2758,7 @@ define i32 @test150(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB150_1
+; PPC64LE-NEXT: bne- 0, .LBB150_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -2774,7 +2774,7 @@ define i32 @test151(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 3, 0, 5
; PPC64LE-NEXT: add 6, 4, 3
; PPC64LE-NEXT: stwcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB151_1
+; PPC64LE-NEXT: bne- 0, .LBB151_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -2790,7 +2790,7 @@ define i32 @test152(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB152_1
+; PPC64LE-NEXT: bne- 0, .LBB152_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -2806,7 +2806,7 @@ define i32 @test153(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB153_1
+; PPC64LE-NEXT: bne- 0, .LBB153_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -2823,7 +2823,7 @@ define i32 @test154(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB154_1
+; PPC64LE-NEXT: bne- 0, .LBB154_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -2839,7 +2839,7 @@ define i64 @test155(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB155_1
+; PPC64LE-NEXT: bne- 0, .LBB155_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -2855,7 +2855,7 @@ define i64 @test156(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 3, 0, 5
; PPC64LE-NEXT: add 6, 4, 3
; PPC64LE-NEXT: stdcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB156_1
+; PPC64LE-NEXT: bne- 0, .LBB156_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -2871,7 +2871,7 @@ define i64 @test157(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB157_1
+; PPC64LE-NEXT: bne- 0, .LBB157_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -2887,7 +2887,7 @@ define i64 @test158(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB158_1
+; PPC64LE-NEXT: bne- 0, .LBB158_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -2904,7 +2904,7 @@ define i64 @test159(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB159_1
+; PPC64LE-NEXT: bne- 0, .LBB159_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -2920,7 +2920,7 @@ define i8 @test160(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB160_1
+; PPC64LE-NEXT: bne- 0, .LBB160_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -2936,7 +2936,7 @@ define i8 @test161(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 3, 0, 5
; PPC64LE-NEXT: sub 6, 3, 4
; PPC64LE-NEXT: stbcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB161_1
+; PPC64LE-NEXT: bne- 0, .LBB161_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -2952,7 +2952,7 @@ define i8 @test162(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB162_1
+; PPC64LE-NEXT: bne- 0, .LBB162_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -2968,7 +2968,7 @@ define i8 @test163(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB163_1
+; PPC64LE-NEXT: bne- 0, .LBB163_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -2985,7 +2985,7 @@ define i8 @test164(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB164_1
+; PPC64LE-NEXT: bne- 0, .LBB164_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -3001,7 +3001,7 @@ define i16 @test165(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB165_1
+; PPC64LE-NEXT: bne- 0, .LBB165_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -3017,7 +3017,7 @@ define i16 @test166(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 3, 0, 5
; PPC64LE-NEXT: sub 6, 3, 4
; PPC64LE-NEXT: sthcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB166_1
+; PPC64LE-NEXT: bne- 0, .LBB166_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -3033,7 +3033,7 @@ define i16 @test167(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB167_1
+; PPC64LE-NEXT: bne- 0, .LBB167_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -3049,7 +3049,7 @@ define i16 @test168(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB168_1
+; PPC64LE-NEXT: bne- 0, .LBB168_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -3066,7 +3066,7 @@ define i16 @test169(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB169_1
+; PPC64LE-NEXT: bne- 0, .LBB169_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -3082,7 +3082,7 @@ define i32 @test170(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB170_1
+; PPC64LE-NEXT: bne- 0, .LBB170_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -3098,7 +3098,7 @@ define i32 @test171(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 3, 0, 5
; PPC64LE-NEXT: sub 6, 3, 4
; PPC64LE-NEXT: stwcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB171_1
+; PPC64LE-NEXT: bne- 0, .LBB171_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -3114,7 +3114,7 @@ define i32 @test172(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB172_1
+; PPC64LE-NEXT: bne- 0, .LBB172_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -3130,7 +3130,7 @@ define i32 @test173(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB173_1
+; PPC64LE-NEXT: bne- 0, .LBB173_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -3147,7 +3147,7 @@ define i32 @test174(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB174_1
+; PPC64LE-NEXT: bne- 0, .LBB174_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -3163,7 +3163,7 @@ define i64 @test175(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB175_1
+; PPC64LE-NEXT: bne- 0, .LBB175_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -3179,7 +3179,7 @@ define i64 @test176(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 3, 0, 5
; PPC64LE-NEXT: sub 6, 3, 4
; PPC64LE-NEXT: stdcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB176_1
+; PPC64LE-NEXT: bne- 0, .LBB176_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -3195,7 +3195,7 @@ define i64 @test177(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB177_1
+; PPC64LE-NEXT: bne- 0, .LBB177_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -3211,7 +3211,7 @@ define i64 @test178(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB178_1
+; PPC64LE-NEXT: bne- 0, .LBB178_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -3228,7 +3228,7 @@ define i64 @test179(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB179_1
+; PPC64LE-NEXT: bne- 0, .LBB179_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -3244,7 +3244,7 @@ define i8 @test180(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB180_1
+; PPC64LE-NEXT: bne- 0, .LBB180_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -3260,7 +3260,7 @@ define i8 @test181(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 3, 0, 5
; PPC64LE-NEXT: and 6, 4, 3
; PPC64LE-NEXT: stbcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB181_1
+; PPC64LE-NEXT: bne- 0, .LBB181_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -3276,7 +3276,7 @@ define i8 @test182(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB182_1
+; PPC64LE-NEXT: bne- 0, .LBB182_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -3292,7 +3292,7 @@ define i8 @test183(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB183_1
+; PPC64LE-NEXT: bne- 0, .LBB183_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -3309,7 +3309,7 @@ define i8 @test184(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB184_1
+; PPC64LE-NEXT: bne- 0, .LBB184_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -3325,7 +3325,7 @@ define i16 @test185(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB185_1
+; PPC64LE-NEXT: bne- 0, .LBB185_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -3341,7 +3341,7 @@ define i16 @test186(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 3, 0, 5
; PPC64LE-NEXT: and 6, 4, 3
; PPC64LE-NEXT: sthcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB186_1
+; PPC64LE-NEXT: bne- 0, .LBB186_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -3357,7 +3357,7 @@ define i16 @test187(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB187_1
+; PPC64LE-NEXT: bne- 0, .LBB187_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -3373,7 +3373,7 @@ define i16 @test188(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB188_1
+; PPC64LE-NEXT: bne- 0, .LBB188_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -3390,7 +3390,7 @@ define i16 @test189(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB189_1
+; PPC64LE-NEXT: bne- 0, .LBB189_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -3406,7 +3406,7 @@ define i32 @test190(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB190_1
+; PPC64LE-NEXT: bne- 0, .LBB190_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -3422,7 +3422,7 @@ define i32 @test191(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 3, 0, 5
; PPC64LE-NEXT: and 6, 4, 3
; PPC64LE-NEXT: stwcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB191_1
+; PPC64LE-NEXT: bne- 0, .LBB191_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -3438,7 +3438,7 @@ define i32 @test192(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB192_1
+; PPC64LE-NEXT: bne- 0, .LBB192_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -3454,7 +3454,7 @@ define i32 @test193(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB193_1
+; PPC64LE-NEXT: bne- 0, .LBB193_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -3471,7 +3471,7 @@ define i32 @test194(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB194_1
+; PPC64LE-NEXT: bne- 0, .LBB194_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -3487,7 +3487,7 @@ define i64 @test195(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB195_1
+; PPC64LE-NEXT: bne- 0, .LBB195_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -3503,7 +3503,7 @@ define i64 @test196(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 3, 0, 5
; PPC64LE-NEXT: and 6, 4, 3
; PPC64LE-NEXT: stdcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB196_1
+; PPC64LE-NEXT: bne- 0, .LBB196_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -3519,7 +3519,7 @@ define i64 @test197(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB197_1
+; PPC64LE-NEXT: bne- 0, .LBB197_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -3535,7 +3535,7 @@ define i64 @test198(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB198_1
+; PPC64LE-NEXT: bne- 0, .LBB198_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -3552,7 +3552,7 @@ define i64 @test199(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB199_1
+; PPC64LE-NEXT: bne- 0, .LBB199_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -3568,7 +3568,7 @@ define i8 @test200(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB200_1
+; PPC64LE-NEXT: bne- 0, .LBB200_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -3584,7 +3584,7 @@ define i8 @test201(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 3, 0, 5
; PPC64LE-NEXT: nand 6, 4, 3
; PPC64LE-NEXT: stbcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB201_1
+; PPC64LE-NEXT: bne- 0, .LBB201_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -3600,7 +3600,7 @@ define i8 @test202(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB202_1
+; PPC64LE-NEXT: bne- 0, .LBB202_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -3616,7 +3616,7 @@ define i8 @test203(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB203_1
+; PPC64LE-NEXT: bne- 0, .LBB203_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -3633,7 +3633,7 @@ define i8 @test204(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB204_1
+; PPC64LE-NEXT: bne- 0, .LBB204_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -3649,7 +3649,7 @@ define i16 @test205(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB205_1
+; PPC64LE-NEXT: bne- 0, .LBB205_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -3665,7 +3665,7 @@ define i16 @test206(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 3, 0, 5
; PPC64LE-NEXT: nand 6, 4, 3
; PPC64LE-NEXT: sthcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB206_1
+; PPC64LE-NEXT: bne- 0, .LBB206_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -3681,7 +3681,7 @@ define i16 @test207(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB207_1
+; PPC64LE-NEXT: bne- 0, .LBB207_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -3697,7 +3697,7 @@ define i16 @test208(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB208_1
+; PPC64LE-NEXT: bne- 0, .LBB208_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -3714,7 +3714,7 @@ define i16 @test209(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB209_1
+; PPC64LE-NEXT: bne- 0, .LBB209_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -3730,7 +3730,7 @@ define i32 @test210(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB210_1
+; PPC64LE-NEXT: bne- 0, .LBB210_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -3746,7 +3746,7 @@ define i32 @test211(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 3, 0, 5
; PPC64LE-NEXT: nand 6, 4, 3
; PPC64LE-NEXT: stwcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB211_1
+; PPC64LE-NEXT: bne- 0, .LBB211_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -3762,7 +3762,7 @@ define i32 @test212(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB212_1
+; PPC64LE-NEXT: bne- 0, .LBB212_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -3778,7 +3778,7 @@ define i32 @test213(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB213_1
+; PPC64LE-NEXT: bne- 0, .LBB213_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -3795,7 +3795,7 @@ define i32 @test214(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB214_1
+; PPC64LE-NEXT: bne- 0, .LBB214_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -3811,7 +3811,7 @@ define i64 @test215(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB215_1
+; PPC64LE-NEXT: bne- 0, .LBB215_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -3827,7 +3827,7 @@ define i64 @test216(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 3, 0, 5
; PPC64LE-NEXT: nand 6, 4, 3
; PPC64LE-NEXT: stdcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB216_1
+; PPC64LE-NEXT: bne- 0, .LBB216_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -3843,7 +3843,7 @@ define i64 @test217(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB217_1
+; PPC64LE-NEXT: bne- 0, .LBB217_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -3859,7 +3859,7 @@ define i64 @test218(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB218_1
+; PPC64LE-NEXT: bne- 0, .LBB218_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -3876,7 +3876,7 @@ define i64 @test219(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB219_1
+; PPC64LE-NEXT: bne- 0, .LBB219_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -3892,7 +3892,7 @@ define i8 @test220(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB220_1
+; PPC64LE-NEXT: bne- 0, .LBB220_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -3908,7 +3908,7 @@ define i8 @test221(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 3, 0, 5
; PPC64LE-NEXT: or 6, 4, 3
; PPC64LE-NEXT: stbcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB221_1
+; PPC64LE-NEXT: bne- 0, .LBB221_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -3924,7 +3924,7 @@ define i8 @test222(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB222_1
+; PPC64LE-NEXT: bne- 0, .LBB222_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -3940,7 +3940,7 @@ define i8 @test223(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB223_1
+; PPC64LE-NEXT: bne- 0, .LBB223_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -3957,7 +3957,7 @@ define i8 @test224(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB224_1
+; PPC64LE-NEXT: bne- 0, .LBB224_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -3973,7 +3973,7 @@ define i16 @test225(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB225_1
+; PPC64LE-NEXT: bne- 0, .LBB225_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -3989,7 +3989,7 @@ define i16 @test226(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 3, 0, 5
; PPC64LE-NEXT: or 6, 4, 3
; PPC64LE-NEXT: sthcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB226_1
+; PPC64LE-NEXT: bne- 0, .LBB226_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -4005,7 +4005,7 @@ define i16 @test227(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB227_1
+; PPC64LE-NEXT: bne- 0, .LBB227_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -4021,7 +4021,7 @@ define i16 @test228(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB228_1
+; PPC64LE-NEXT: bne- 0, .LBB228_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -4038,7 +4038,7 @@ define i16 @test229(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB229_1
+; PPC64LE-NEXT: bne- 0, .LBB229_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -4054,7 +4054,7 @@ define i32 @test230(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB230_1
+; PPC64LE-NEXT: bne- 0, .LBB230_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -4070,7 +4070,7 @@ define i32 @test231(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 3, 0, 5
; PPC64LE-NEXT: or 6, 4, 3
; PPC64LE-NEXT: stwcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB231_1
+; PPC64LE-NEXT: bne- 0, .LBB231_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -4086,7 +4086,7 @@ define i32 @test232(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB232_1
+; PPC64LE-NEXT: bne- 0, .LBB232_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -4102,7 +4102,7 @@ define i32 @test233(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB233_1
+; PPC64LE-NEXT: bne- 0, .LBB233_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -4119,7 +4119,7 @@ define i32 @test234(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB234_1
+; PPC64LE-NEXT: bne- 0, .LBB234_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -4135,7 +4135,7 @@ define i64 @test235(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB235_1
+; PPC64LE-NEXT: bne- 0, .LBB235_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -4151,7 +4151,7 @@ define i64 @test236(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 3, 0, 5
; PPC64LE-NEXT: or 6, 4, 3
; PPC64LE-NEXT: stdcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB236_1
+; PPC64LE-NEXT: bne- 0, .LBB236_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -4167,7 +4167,7 @@ define i64 @test237(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB237_1
+; PPC64LE-NEXT: bne- 0, .LBB237_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -4183,7 +4183,7 @@ define i64 @test238(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB238_1
+; PPC64LE-NEXT: bne- 0, .LBB238_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -4200,7 +4200,7 @@ define i64 @test239(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB239_1
+; PPC64LE-NEXT: bne- 0, .LBB239_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -4216,7 +4216,7 @@ define i8 @test240(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB240_1
+; PPC64LE-NEXT: bne- 0, .LBB240_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -4232,7 +4232,7 @@ define i8 @test241(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 3, 0, 5
; PPC64LE-NEXT: xor 6, 4, 3
; PPC64LE-NEXT: stbcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB241_1
+; PPC64LE-NEXT: bne- 0, .LBB241_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -4248,7 +4248,7 @@ define i8 @test242(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB242_1
+; PPC64LE-NEXT: bne- 0, .LBB242_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -4264,7 +4264,7 @@ define i8 @test243(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB243_1
+; PPC64LE-NEXT: bne- 0, .LBB243_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -4281,7 +4281,7 @@ define i8 @test244(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB244_1
+; PPC64LE-NEXT: bne- 0, .LBB244_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -4297,7 +4297,7 @@ define i16 @test245(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB245_1
+; PPC64LE-NEXT: bne- 0, .LBB245_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -4313,7 +4313,7 @@ define i16 @test246(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 3, 0, 5
; PPC64LE-NEXT: xor 6, 4, 3
; PPC64LE-NEXT: sthcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB246_1
+; PPC64LE-NEXT: bne- 0, .LBB246_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -4329,7 +4329,7 @@ define i16 @test247(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB247_1
+; PPC64LE-NEXT: bne- 0, .LBB247_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -4345,7 +4345,7 @@ define i16 @test248(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB248_1
+; PPC64LE-NEXT: bne- 0, .LBB248_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -4362,7 +4362,7 @@ define i16 @test249(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB249_1
+; PPC64LE-NEXT: bne- 0, .LBB249_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -4378,7 +4378,7 @@ define i32 @test250(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB250_1
+; PPC64LE-NEXT: bne- 0, .LBB250_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -4394,7 +4394,7 @@ define i32 @test251(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 3, 0, 5
; PPC64LE-NEXT: xor 6, 4, 3
; PPC64LE-NEXT: stwcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB251_1
+; PPC64LE-NEXT: bne- 0, .LBB251_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -4410,7 +4410,7 @@ define i32 @test252(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB252_1
+; PPC64LE-NEXT: bne- 0, .LBB252_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -4426,7 +4426,7 @@ define i32 @test253(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB253_1
+; PPC64LE-NEXT: bne- 0, .LBB253_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -4443,7 +4443,7 @@ define i32 @test254(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB254_1
+; PPC64LE-NEXT: bne- 0, .LBB254_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -4459,7 +4459,7 @@ define i64 @test255(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB255_1
+; PPC64LE-NEXT: bne- 0, .LBB255_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -4475,7 +4475,7 @@ define i64 @test256(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 3, 0, 5
; PPC64LE-NEXT: xor 6, 4, 3
; PPC64LE-NEXT: stdcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB256_1
+; PPC64LE-NEXT: bne- 0, .LBB256_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -4491,7 +4491,7 @@ define i64 @test257(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB257_1
+; PPC64LE-NEXT: bne- 0, .LBB257_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -4507,7 +4507,7 @@ define i64 @test258(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB258_1
+; PPC64LE-NEXT: bne- 0, .LBB258_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -4524,7 +4524,7 @@ define i64 @test259(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB259_1
+; PPC64LE-NEXT: bne- 0, .LBB259_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -4544,7 +4544,7 @@ define i8 @test260(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: bgt 0, .LBB260_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB260_1
+; PPC64LE-NEXT: bne- 0, .LBB260_1
; PPC64LE-NEXT: .LBB260_3:
; PPC64LE-NEXT: mr 3, 4
; PPC64LE-NEXT: blr
@@ -4563,7 +4563,7 @@ define i8 @test261(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: bgt 0, .LBB261_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB261_1
+; PPC64LE-NEXT: bne- 0, .LBB261_1
; PPC64LE-NEXT: .LBB261_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 4
@@ -4584,7 +4584,7 @@ define i8 @test262(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: bgt 0, .LBB262_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB262_1
+; PPC64LE-NEXT: bne- 0, .LBB262_1
; PPC64LE-NEXT: .LBB262_3:
; PPC64LE-NEXT: mr 3, 4
; PPC64LE-NEXT: blr
@@ -4604,7 +4604,7 @@ define i8 @test263(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: bgt 0, .LBB263_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB263_1
+; PPC64LE-NEXT: bne- 0, .LBB263_1
; PPC64LE-NEXT: .LBB263_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 4
@@ -4625,7 +4625,7 @@ define i8 @test264(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: bgt 0, .LBB264_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB264_1
+; PPC64LE-NEXT: bne- 0, .LBB264_1
; PPC64LE-NEXT: .LBB264_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 4
@@ -4645,7 +4645,7 @@ define i16 @test265(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: bgt 0, .LBB265_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB265_1
+; PPC64LE-NEXT: bne- 0, .LBB265_1
; PPC64LE-NEXT: .LBB265_3:
; PPC64LE-NEXT: mr 3, 4
; PPC64LE-NEXT: blr
@@ -4664,7 +4664,7 @@ define i16 @test266(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: bgt 0, .LBB266_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB266_1
+; PPC64LE-NEXT: bne- 0, .LBB266_1
; PPC64LE-NEXT: .LBB266_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 4
@@ -4685,7 +4685,7 @@ define i16 @test267(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: bgt 0, .LBB267_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB267_1
+; PPC64LE-NEXT: bne- 0, .LBB267_1
; PPC64LE-NEXT: .LBB267_3:
; PPC64LE-NEXT: mr 3, 4
; PPC64LE-NEXT: blr
@@ -4705,7 +4705,7 @@ define i16 @test268(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: bgt 0, .LBB268_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB268_1
+; PPC64LE-NEXT: bne- 0, .LBB268_1
; PPC64LE-NEXT: .LBB268_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 4
@@ -4726,7 +4726,7 @@ define i16 @test269(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: bgt 0, .LBB269_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB269_1
+; PPC64LE-NEXT: bne- 0, .LBB269_1
; PPC64LE-NEXT: .LBB269_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 4
@@ -4744,7 +4744,7 @@ define i32 @test270(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: bgt 0, .LBB270_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB270_1
+; PPC64LE-NEXT: bne- 0, .LBB270_1
; PPC64LE-NEXT: .LBB270_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -4762,7 +4762,7 @@ define i32 @test271(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: bgt 0, .LBB271_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB271_1
+; PPC64LE-NEXT: bne- 0, .LBB271_1
; PPC64LE-NEXT: .LBB271_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -4780,7 +4780,7 @@ define i32 @test272(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: bgt 0, .LBB272_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB272_1
+; PPC64LE-NEXT: bne- 0, .LBB272_1
; PPC64LE-NEXT: .LBB272_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -4798,7 +4798,7 @@ define i32 @test273(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: bgt 0, .LBB273_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB273_1
+; PPC64LE-NEXT: bne- 0, .LBB273_1
; PPC64LE-NEXT: .LBB273_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -4817,7 +4817,7 @@ define i32 @test274(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: bgt 0, .LBB274_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB274_1
+; PPC64LE-NEXT: bne- 0, .LBB274_1
; PPC64LE-NEXT: .LBB274_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -4835,7 +4835,7 @@ define i64 @test275(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: bgt 0, .LBB275_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB275_1
+; PPC64LE-NEXT: bne- 0, .LBB275_1
; PPC64LE-NEXT: .LBB275_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -4853,7 +4853,7 @@ define i64 @test276(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: bgt 0, .LBB276_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB276_1
+; PPC64LE-NEXT: bne- 0, .LBB276_1
; PPC64LE-NEXT: .LBB276_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -4871,7 +4871,7 @@ define i64 @test277(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: bgt 0, .LBB277_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB277_1
+; PPC64LE-NEXT: bne- 0, .LBB277_1
; PPC64LE-NEXT: .LBB277_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -4889,7 +4889,7 @@ define i64 @test278(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: bgt 0, .LBB278_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB278_1
+; PPC64LE-NEXT: bne- 0, .LBB278_1
; PPC64LE-NEXT: .LBB278_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -4908,7 +4908,7 @@ define i64 @test279(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: bgt 0, .LBB279_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB279_1
+; PPC64LE-NEXT: bne- 0, .LBB279_1
; PPC64LE-NEXT: .LBB279_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -4928,7 +4928,7 @@ define i8 @test280(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: blt 0, .LBB280_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB280_1
+; PPC64LE-NEXT: bne- 0, .LBB280_1
; PPC64LE-NEXT: .LBB280_3:
; PPC64LE-NEXT: mr 3, 4
; PPC64LE-NEXT: blr
@@ -4947,7 +4947,7 @@ define i8 @test281(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: blt 0, .LBB281_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB281_1
+; PPC64LE-NEXT: bne- 0, .LBB281_1
; PPC64LE-NEXT: .LBB281_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 4
@@ -4968,7 +4968,7 @@ define i8 @test282(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: blt 0, .LBB282_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB282_1
+; PPC64LE-NEXT: bne- 0, .LBB282_1
; PPC64LE-NEXT: .LBB282_3:
; PPC64LE-NEXT: mr 3, 4
; PPC64LE-NEXT: blr
@@ -4988,7 +4988,7 @@ define i8 @test283(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: blt 0, .LBB283_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB283_1
+; PPC64LE-NEXT: bne- 0, .LBB283_1
; PPC64LE-NEXT: .LBB283_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 4
@@ -5009,7 +5009,7 @@ define i8 @test284(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: blt 0, .LBB284_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB284_1
+; PPC64LE-NEXT: bne- 0, .LBB284_1
; PPC64LE-NEXT: .LBB284_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 4
@@ -5029,7 +5029,7 @@ define i16 @test285(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: blt 0, .LBB285_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB285_1
+; PPC64LE-NEXT: bne- 0, .LBB285_1
; PPC64LE-NEXT: .LBB285_3:
; PPC64LE-NEXT: mr 3, 4
; PPC64LE-NEXT: blr
@@ -5048,7 +5048,7 @@ define i16 @test286(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: blt 0, .LBB286_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB286_1
+; PPC64LE-NEXT: bne- 0, .LBB286_1
; PPC64LE-NEXT: .LBB286_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 4
@@ -5069,7 +5069,7 @@ define i16 @test287(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: blt 0, .LBB287_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB287_1
+; PPC64LE-NEXT: bne- 0, .LBB287_1
; PPC64LE-NEXT: .LBB287_3:
; PPC64LE-NEXT: mr 3, 4
; PPC64LE-NEXT: blr
@@ -5089,7 +5089,7 @@ define i16 @test288(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: blt 0, .LBB288_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB288_1
+; PPC64LE-NEXT: bne- 0, .LBB288_1
; PPC64LE-NEXT: .LBB288_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 4
@@ -5110,7 +5110,7 @@ define i16 @test289(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: blt 0, .LBB289_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB289_1
+; PPC64LE-NEXT: bne- 0, .LBB289_1
; PPC64LE-NEXT: .LBB289_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 4
@@ -5128,7 +5128,7 @@ define i32 @test290(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: blt 0, .LBB290_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB290_1
+; PPC64LE-NEXT: bne- 0, .LBB290_1
; PPC64LE-NEXT: .LBB290_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -5146,7 +5146,7 @@ define i32 @test291(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: blt 0, .LBB291_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB291_1
+; PPC64LE-NEXT: bne- 0, .LBB291_1
; PPC64LE-NEXT: .LBB291_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -5164,7 +5164,7 @@ define i32 @test292(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: blt 0, .LBB292_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB292_1
+; PPC64LE-NEXT: bne- 0, .LBB292_1
; PPC64LE-NEXT: .LBB292_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -5182,7 +5182,7 @@ define i32 @test293(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: blt 0, .LBB293_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB293_1
+; PPC64LE-NEXT: bne- 0, .LBB293_1
; PPC64LE-NEXT: .LBB293_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -5201,7 +5201,7 @@ define i32 @test294(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: blt 0, .LBB294_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB294_1
+; PPC64LE-NEXT: bne- 0, .LBB294_1
; PPC64LE-NEXT: .LBB294_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -5219,7 +5219,7 @@ define i64 @test295(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: blt 0, .LBB295_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB295_1
+; PPC64LE-NEXT: bne- 0, .LBB295_1
; PPC64LE-NEXT: .LBB295_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -5237,7 +5237,7 @@ define i64 @test296(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: blt 0, .LBB296_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB296_1
+; PPC64LE-NEXT: bne- 0, .LBB296_1
; PPC64LE-NEXT: .LBB296_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -5255,7 +5255,7 @@ define i64 @test297(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: blt 0, .LBB297_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB297_1
+; PPC64LE-NEXT: bne- 0, .LBB297_1
; PPC64LE-NEXT: .LBB297_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -5273,7 +5273,7 @@ define i64 @test298(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: blt 0, .LBB298_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB298_1
+; PPC64LE-NEXT: bne- 0, .LBB298_1
; PPC64LE-NEXT: .LBB298_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -5292,7 +5292,7 @@ define i64 @test299(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: blt 0, .LBB299_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB299_1
+; PPC64LE-NEXT: bne- 0, .LBB299_1
; PPC64LE-NEXT: .LBB299_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -5310,7 +5310,7 @@ define i8 @test300(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: bgt 0, .LBB300_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB300_1
+; PPC64LE-NEXT: bne- 0, .LBB300_1
; PPC64LE-NEXT: .LBB300_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -5328,7 +5328,7 @@ define i8 @test301(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: bgt 0, .LBB301_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB301_1
+; PPC64LE-NEXT: bne- 0, .LBB301_1
; PPC64LE-NEXT: .LBB301_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -5346,7 +5346,7 @@ define i8 @test302(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: bgt 0, .LBB302_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB302_1
+; PPC64LE-NEXT: bne- 0, .LBB302_1
; PPC64LE-NEXT: .LBB302_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -5364,7 +5364,7 @@ define i8 @test303(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: bgt 0, .LBB303_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB303_1
+; PPC64LE-NEXT: bne- 0, .LBB303_1
; PPC64LE-NEXT: .LBB303_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -5383,7 +5383,7 @@ define i8 @test304(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: bgt 0, .LBB304_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB304_1
+; PPC64LE-NEXT: bne- 0, .LBB304_1
; PPC64LE-NEXT: .LBB304_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -5401,7 +5401,7 @@ define i16 @test305(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: bgt 0, .LBB305_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB305_1
+; PPC64LE-NEXT: bne- 0, .LBB305_1
; PPC64LE-NEXT: .LBB305_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -5419,7 +5419,7 @@ define i16 @test306(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: bgt 0, .LBB306_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB306_1
+; PPC64LE-NEXT: bne- 0, .LBB306_1
; PPC64LE-NEXT: .LBB306_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -5437,7 +5437,7 @@ define i16 @test307(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: bgt 0, .LBB307_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB307_1
+; PPC64LE-NEXT: bne- 0, .LBB307_1
; PPC64LE-NEXT: .LBB307_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -5455,7 +5455,7 @@ define i16 @test308(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: bgt 0, .LBB308_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB308_1
+; PPC64LE-NEXT: bne- 0, .LBB308_1
; PPC64LE-NEXT: .LBB308_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -5474,7 +5474,7 @@ define i16 @test309(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: bgt 0, .LBB309_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB309_1
+; PPC64LE-NEXT: bne- 0, .LBB309_1
; PPC64LE-NEXT: .LBB309_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -5492,7 +5492,7 @@ define i32 @test310(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: bgt 0, .LBB310_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB310_1
+; PPC64LE-NEXT: bne- 0, .LBB310_1
; PPC64LE-NEXT: .LBB310_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -5510,7 +5510,7 @@ define i32 @test311(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: bgt 0, .LBB311_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB311_1
+; PPC64LE-NEXT: bne- 0, .LBB311_1
; PPC64LE-NEXT: .LBB311_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -5528,7 +5528,7 @@ define i32 @test312(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: bgt 0, .LBB312_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB312_1
+; PPC64LE-NEXT: bne- 0, .LBB312_1
; PPC64LE-NEXT: .LBB312_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -5546,7 +5546,7 @@ define i32 @test313(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: bgt 0, .LBB313_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB313_1
+; PPC64LE-NEXT: bne- 0, .LBB313_1
; PPC64LE-NEXT: .LBB313_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -5565,7 +5565,7 @@ define i32 @test314(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: bgt 0, .LBB314_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB314_1
+; PPC64LE-NEXT: bne- 0, .LBB314_1
; PPC64LE-NEXT: .LBB314_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -5583,7 +5583,7 @@ define i64 @test315(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: bgt 0, .LBB315_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB315_1
+; PPC64LE-NEXT: bne- 0, .LBB315_1
; PPC64LE-NEXT: .LBB315_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -5601,7 +5601,7 @@ define i64 @test316(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: bgt 0, .LBB316_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB316_1
+; PPC64LE-NEXT: bne- 0, .LBB316_1
; PPC64LE-NEXT: .LBB316_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -5619,7 +5619,7 @@ define i64 @test317(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: bgt 0, .LBB317_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB317_1
+; PPC64LE-NEXT: bne- 0, .LBB317_1
; PPC64LE-NEXT: .LBB317_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -5637,7 +5637,7 @@ define i64 @test318(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: bgt 0, .LBB318_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB318_1
+; PPC64LE-NEXT: bne- 0, .LBB318_1
; PPC64LE-NEXT: .LBB318_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -5656,7 +5656,7 @@ define i64 @test319(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: bgt 0, .LBB319_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB319_1
+; PPC64LE-NEXT: bne- 0, .LBB319_1
; PPC64LE-NEXT: .LBB319_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -5674,7 +5674,7 @@ define i8 @test320(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: blt 0, .LBB320_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB320_1
+; PPC64LE-NEXT: bne- 0, .LBB320_1
; PPC64LE-NEXT: .LBB320_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -5692,7 +5692,7 @@ define i8 @test321(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: blt 0, .LBB321_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB321_1
+; PPC64LE-NEXT: bne- 0, .LBB321_1
; PPC64LE-NEXT: .LBB321_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -5710,7 +5710,7 @@ define i8 @test322(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: blt 0, .LBB322_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB322_1
+; PPC64LE-NEXT: bne- 0, .LBB322_1
; PPC64LE-NEXT: .LBB322_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -5728,7 +5728,7 @@ define i8 @test323(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: blt 0, .LBB323_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB323_1
+; PPC64LE-NEXT: bne- 0, .LBB323_1
; PPC64LE-NEXT: .LBB323_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -5747,7 +5747,7 @@ define i8 @test324(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: blt 0, .LBB324_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB324_1
+; PPC64LE-NEXT: bne- 0, .LBB324_1
; PPC64LE-NEXT: .LBB324_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -5765,7 +5765,7 @@ define i16 @test325(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: blt 0, .LBB325_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB325_1
+; PPC64LE-NEXT: bne- 0, .LBB325_1
; PPC64LE-NEXT: .LBB325_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -5783,7 +5783,7 @@ define i16 @test326(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: blt 0, .LBB326_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB326_1
+; PPC64LE-NEXT: bne- 0, .LBB326_1
; PPC64LE-NEXT: .LBB326_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -5801,7 +5801,7 @@ define i16 @test327(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: blt 0, .LBB327_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB327_1
+; PPC64LE-NEXT: bne- 0, .LBB327_1
; PPC64LE-NEXT: .LBB327_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -5819,7 +5819,7 @@ define i16 @test328(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: blt 0, .LBB328_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB328_1
+; PPC64LE-NEXT: bne- 0, .LBB328_1
; PPC64LE-NEXT: .LBB328_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -5838,7 +5838,7 @@ define i16 @test329(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: blt 0, .LBB329_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB329_1
+; PPC64LE-NEXT: bne- 0, .LBB329_1
; PPC64LE-NEXT: .LBB329_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -5856,7 +5856,7 @@ define i32 @test330(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: blt 0, .LBB330_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB330_1
+; PPC64LE-NEXT: bne- 0, .LBB330_1
; PPC64LE-NEXT: .LBB330_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -5874,7 +5874,7 @@ define i32 @test331(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: blt 0, .LBB331_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB331_1
+; PPC64LE-NEXT: bne- 0, .LBB331_1
; PPC64LE-NEXT: .LBB331_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -5892,7 +5892,7 @@ define i32 @test332(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: blt 0, .LBB332_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB332_1
+; PPC64LE-NEXT: bne- 0, .LBB332_1
; PPC64LE-NEXT: .LBB332_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -5910,7 +5910,7 @@ define i32 @test333(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: blt 0, .LBB333_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB333_1
+; PPC64LE-NEXT: bne- 0, .LBB333_1
; PPC64LE-NEXT: .LBB333_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -5929,7 +5929,7 @@ define i32 @test334(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: blt 0, .LBB334_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB334_1
+; PPC64LE-NEXT: bne- 0, .LBB334_1
; PPC64LE-NEXT: .LBB334_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -5947,7 +5947,7 @@ define i64 @test335(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: blt 0, .LBB335_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB335_1
+; PPC64LE-NEXT: bne- 0, .LBB335_1
; PPC64LE-NEXT: .LBB335_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -5965,7 +5965,7 @@ define i64 @test336(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: blt 0, .LBB336_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB336_1
+; PPC64LE-NEXT: bne- 0, .LBB336_1
; PPC64LE-NEXT: .LBB336_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -5983,7 +5983,7 @@ define i64 @test337(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: blt 0, .LBB337_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB337_1
+; PPC64LE-NEXT: bne- 0, .LBB337_1
; PPC64LE-NEXT: .LBB337_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -6001,7 +6001,7 @@ define i64 @test338(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: blt 0, .LBB338_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB338_1
+; PPC64LE-NEXT: bne- 0, .LBB338_1
; PPC64LE-NEXT: .LBB338_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -6020,7 +6020,7 @@ define i64 @test339(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: blt 0, .LBB339_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB339_1
+; PPC64LE-NEXT: bne- 0, .LBB339_1
; PPC64LE-NEXT: .LBB339_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -6035,7 +6035,7 @@ define i8 @test340(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB340_1:
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: stbcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB340_1
+; PPC64LE-NEXT: bne- 0, .LBB340_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -6050,7 +6050,7 @@ define i8 @test341(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB341_1:
; PPC64LE-NEXT: lbarx 3, 0, 5
; PPC64LE-NEXT: stbcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB341_1
+; PPC64LE-NEXT: bne- 0, .LBB341_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -6065,7 +6065,7 @@ define i8 @test342(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB342_1:
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: stbcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB342_1
+; PPC64LE-NEXT: bne- 0, .LBB342_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -6080,7 +6080,7 @@ define i8 @test343(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB343_1:
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: stbcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB343_1
+; PPC64LE-NEXT: bne- 0, .LBB343_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -6096,7 +6096,7 @@ define i8 @test344(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB344_1:
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: stbcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB344_1
+; PPC64LE-NEXT: bne- 0, .LBB344_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -6111,7 +6111,7 @@ define i16 @test345(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB345_1:
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: sthcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB345_1
+; PPC64LE-NEXT: bne- 0, .LBB345_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -6126,7 +6126,7 @@ define i16 @test346(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB346_1:
; PPC64LE-NEXT: lharx 3, 0, 5
; PPC64LE-NEXT: sthcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB346_1
+; PPC64LE-NEXT: bne- 0, .LBB346_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -6141,7 +6141,7 @@ define i16 @test347(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB347_1:
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: sthcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB347_1
+; PPC64LE-NEXT: bne- 0, .LBB347_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -6156,7 +6156,7 @@ define i16 @test348(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB348_1:
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: sthcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB348_1
+; PPC64LE-NEXT: bne- 0, .LBB348_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -6172,7 +6172,7 @@ define i16 @test349(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB349_1:
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: sthcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB349_1
+; PPC64LE-NEXT: bne- 0, .LBB349_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -6187,7 +6187,7 @@ define i32 @test350(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB350_1:
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB350_1
+; PPC64LE-NEXT: bne- 0, .LBB350_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -6202,7 +6202,7 @@ define i32 @test351(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB351_1:
; PPC64LE-NEXT: lwarx 3, 0, 5
; PPC64LE-NEXT: stwcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB351_1
+; PPC64LE-NEXT: bne- 0, .LBB351_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -6217,7 +6217,7 @@ define i32 @test352(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB352_1:
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB352_1
+; PPC64LE-NEXT: bne- 0, .LBB352_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -6232,7 +6232,7 @@ define i32 @test353(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB353_1:
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB353_1
+; PPC64LE-NEXT: bne- 0, .LBB353_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -6248,7 +6248,7 @@ define i32 @test354(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB354_1:
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB354_1
+; PPC64LE-NEXT: bne- 0, .LBB354_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -6263,7 +6263,7 @@ define i64 @test355(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB355_1:
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB355_1
+; PPC64LE-NEXT: bne- 0, .LBB355_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -6278,7 +6278,7 @@ define i64 @test356(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB356_1:
; PPC64LE-NEXT: ldarx 3, 0, 5
; PPC64LE-NEXT: stdcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB356_1
+; PPC64LE-NEXT: bne- 0, .LBB356_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -6293,7 +6293,7 @@ define i64 @test357(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB357_1:
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB357_1
+; PPC64LE-NEXT: bne- 0, .LBB357_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -6308,7 +6308,7 @@ define i64 @test358(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB358_1:
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB358_1
+; PPC64LE-NEXT: bne- 0, .LBB358_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -6324,7 +6324,7 @@ define i64 @test359(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB359_1:
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB359_1
+; PPC64LE-NEXT: bne- 0, .LBB359_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -6340,7 +6340,7 @@ define i8 @test360(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB360_1
+; PPC64LE-NEXT: bne- 0, .LBB360_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -6356,7 +6356,7 @@ define i8 @test361(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 3, 0, 5
; PPC64LE-NEXT: add 6, 4, 3
; PPC64LE-NEXT: stbcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB361_1
+; PPC64LE-NEXT: bne- 0, .LBB361_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -6372,7 +6372,7 @@ define i8 @test362(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB362_1
+; PPC64LE-NEXT: bne- 0, .LBB362_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -6388,7 +6388,7 @@ define i8 @test363(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB363_1
+; PPC64LE-NEXT: bne- 0, .LBB363_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -6405,7 +6405,7 @@ define i8 @test364(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB364_1
+; PPC64LE-NEXT: bne- 0, .LBB364_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -6421,7 +6421,7 @@ define i16 @test365(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB365_1
+; PPC64LE-NEXT: bne- 0, .LBB365_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -6437,7 +6437,7 @@ define i16 @test366(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 3, 0, 5
; PPC64LE-NEXT: add 6, 4, 3
; PPC64LE-NEXT: sthcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB366_1
+; PPC64LE-NEXT: bne- 0, .LBB366_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -6453,7 +6453,7 @@ define i16 @test367(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB367_1
+; PPC64LE-NEXT: bne- 0, .LBB367_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -6469,7 +6469,7 @@ define i16 @test368(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB368_1
+; PPC64LE-NEXT: bne- 0, .LBB368_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -6486,7 +6486,7 @@ define i16 @test369(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB369_1
+; PPC64LE-NEXT: bne- 0, .LBB369_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -6502,7 +6502,7 @@ define i32 @test370(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB370_1
+; PPC64LE-NEXT: bne- 0, .LBB370_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -6518,7 +6518,7 @@ define i32 @test371(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 3, 0, 5
; PPC64LE-NEXT: add 6, 4, 3
; PPC64LE-NEXT: stwcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB371_1
+; PPC64LE-NEXT: bne- 0, .LBB371_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -6534,7 +6534,7 @@ define i32 @test372(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB372_1
+; PPC64LE-NEXT: bne- 0, .LBB372_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -6550,7 +6550,7 @@ define i32 @test373(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB373_1
+; PPC64LE-NEXT: bne- 0, .LBB373_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -6567,7 +6567,7 @@ define i32 @test374(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB374_1
+; PPC64LE-NEXT: bne- 0, .LBB374_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -6583,7 +6583,7 @@ define i64 @test375(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB375_1
+; PPC64LE-NEXT: bne- 0, .LBB375_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -6599,7 +6599,7 @@ define i64 @test376(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 3, 0, 5
; PPC64LE-NEXT: add 6, 4, 3
; PPC64LE-NEXT: stdcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB376_1
+; PPC64LE-NEXT: bne- 0, .LBB376_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -6615,7 +6615,7 @@ define i64 @test377(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB377_1
+; PPC64LE-NEXT: bne- 0, .LBB377_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -6631,7 +6631,7 @@ define i64 @test378(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB378_1
+; PPC64LE-NEXT: bne- 0, .LBB378_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -6648,7 +6648,7 @@ define i64 @test379(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: add 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB379_1
+; PPC64LE-NEXT: bne- 0, .LBB379_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -6664,7 +6664,7 @@ define i8 @test380(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB380_1
+; PPC64LE-NEXT: bne- 0, .LBB380_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -6680,7 +6680,7 @@ define i8 @test381(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 3, 0, 5
; PPC64LE-NEXT: sub 6, 3, 4
; PPC64LE-NEXT: stbcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB381_1
+; PPC64LE-NEXT: bne- 0, .LBB381_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -6696,7 +6696,7 @@ define i8 @test382(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB382_1
+; PPC64LE-NEXT: bne- 0, .LBB382_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -6712,7 +6712,7 @@ define i8 @test383(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB383_1
+; PPC64LE-NEXT: bne- 0, .LBB383_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -6729,7 +6729,7 @@ define i8 @test384(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB384_1
+; PPC64LE-NEXT: bne- 0, .LBB384_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -6745,7 +6745,7 @@ define i16 @test385(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB385_1
+; PPC64LE-NEXT: bne- 0, .LBB385_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -6761,7 +6761,7 @@ define i16 @test386(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 3, 0, 5
; PPC64LE-NEXT: sub 6, 3, 4
; PPC64LE-NEXT: sthcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB386_1
+; PPC64LE-NEXT: bne- 0, .LBB386_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -6777,7 +6777,7 @@ define i16 @test387(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB387_1
+; PPC64LE-NEXT: bne- 0, .LBB387_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -6793,7 +6793,7 @@ define i16 @test388(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB388_1
+; PPC64LE-NEXT: bne- 0, .LBB388_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -6810,7 +6810,7 @@ define i16 @test389(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB389_1
+; PPC64LE-NEXT: bne- 0, .LBB389_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -6826,7 +6826,7 @@ define i32 @test390(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB390_1
+; PPC64LE-NEXT: bne- 0, .LBB390_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -6842,7 +6842,7 @@ define i32 @test391(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 3, 0, 5
; PPC64LE-NEXT: sub 6, 3, 4
; PPC64LE-NEXT: stwcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB391_1
+; PPC64LE-NEXT: bne- 0, .LBB391_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -6858,7 +6858,7 @@ define i32 @test392(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB392_1
+; PPC64LE-NEXT: bne- 0, .LBB392_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -6874,7 +6874,7 @@ define i32 @test393(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB393_1
+; PPC64LE-NEXT: bne- 0, .LBB393_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -6891,7 +6891,7 @@ define i32 @test394(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB394_1
+; PPC64LE-NEXT: bne- 0, .LBB394_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -6907,7 +6907,7 @@ define i64 @test395(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB395_1
+; PPC64LE-NEXT: bne- 0, .LBB395_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -6923,7 +6923,7 @@ define i64 @test396(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 3, 0, 5
; PPC64LE-NEXT: sub 6, 3, 4
; PPC64LE-NEXT: stdcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB396_1
+; PPC64LE-NEXT: bne- 0, .LBB396_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -6939,7 +6939,7 @@ define i64 @test397(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB397_1
+; PPC64LE-NEXT: bne- 0, .LBB397_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -6955,7 +6955,7 @@ define i64 @test398(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB398_1
+; PPC64LE-NEXT: bne- 0, .LBB398_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -6972,7 +6972,7 @@ define i64 @test399(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: sub 6, 5, 4
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB399_1
+; PPC64LE-NEXT: bne- 0, .LBB399_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -6988,7 +6988,7 @@ define i8 @test400(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB400_1
+; PPC64LE-NEXT: bne- 0, .LBB400_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -7004,7 +7004,7 @@ define i8 @test401(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 3, 0, 5
; PPC64LE-NEXT: and 6, 4, 3
; PPC64LE-NEXT: stbcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB401_1
+; PPC64LE-NEXT: bne- 0, .LBB401_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -7020,7 +7020,7 @@ define i8 @test402(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB402_1
+; PPC64LE-NEXT: bne- 0, .LBB402_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -7036,7 +7036,7 @@ define i8 @test403(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB403_1
+; PPC64LE-NEXT: bne- 0, .LBB403_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -7053,7 +7053,7 @@ define i8 @test404(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB404_1
+; PPC64LE-NEXT: bne- 0, .LBB404_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -7069,7 +7069,7 @@ define i16 @test405(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB405_1
+; PPC64LE-NEXT: bne- 0, .LBB405_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -7085,7 +7085,7 @@ define i16 @test406(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 3, 0, 5
; PPC64LE-NEXT: and 6, 4, 3
; PPC64LE-NEXT: sthcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB406_1
+; PPC64LE-NEXT: bne- 0, .LBB406_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -7101,7 +7101,7 @@ define i16 @test407(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB407_1
+; PPC64LE-NEXT: bne- 0, .LBB407_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -7117,7 +7117,7 @@ define i16 @test408(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB408_1
+; PPC64LE-NEXT: bne- 0, .LBB408_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -7134,7 +7134,7 @@ define i16 @test409(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB409_1
+; PPC64LE-NEXT: bne- 0, .LBB409_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -7150,7 +7150,7 @@ define i32 @test410(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB410_1
+; PPC64LE-NEXT: bne- 0, .LBB410_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -7166,7 +7166,7 @@ define i32 @test411(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 3, 0, 5
; PPC64LE-NEXT: and 6, 4, 3
; PPC64LE-NEXT: stwcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB411_1
+; PPC64LE-NEXT: bne- 0, .LBB411_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -7182,7 +7182,7 @@ define i32 @test412(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB412_1
+; PPC64LE-NEXT: bne- 0, .LBB412_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -7198,7 +7198,7 @@ define i32 @test413(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB413_1
+; PPC64LE-NEXT: bne- 0, .LBB413_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -7215,7 +7215,7 @@ define i32 @test414(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB414_1
+; PPC64LE-NEXT: bne- 0, .LBB414_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -7231,7 +7231,7 @@ define i64 @test415(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB415_1
+; PPC64LE-NEXT: bne- 0, .LBB415_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -7247,7 +7247,7 @@ define i64 @test416(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 3, 0, 5
; PPC64LE-NEXT: and 6, 4, 3
; PPC64LE-NEXT: stdcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB416_1
+; PPC64LE-NEXT: bne- 0, .LBB416_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -7263,7 +7263,7 @@ define i64 @test417(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB417_1
+; PPC64LE-NEXT: bne- 0, .LBB417_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -7279,7 +7279,7 @@ define i64 @test418(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB418_1
+; PPC64LE-NEXT: bne- 0, .LBB418_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -7296,7 +7296,7 @@ define i64 @test419(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: and 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB419_1
+; PPC64LE-NEXT: bne- 0, .LBB419_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -7312,7 +7312,7 @@ define i8 @test420(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB420_1
+; PPC64LE-NEXT: bne- 0, .LBB420_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -7328,7 +7328,7 @@ define i8 @test421(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 3, 0, 5
; PPC64LE-NEXT: nand 6, 4, 3
; PPC64LE-NEXT: stbcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB421_1
+; PPC64LE-NEXT: bne- 0, .LBB421_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -7344,7 +7344,7 @@ define i8 @test422(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB422_1
+; PPC64LE-NEXT: bne- 0, .LBB422_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -7360,7 +7360,7 @@ define i8 @test423(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB423_1
+; PPC64LE-NEXT: bne- 0, .LBB423_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -7377,7 +7377,7 @@ define i8 @test424(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB424_1
+; PPC64LE-NEXT: bne- 0, .LBB424_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -7393,7 +7393,7 @@ define i16 @test425(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB425_1
+; PPC64LE-NEXT: bne- 0, .LBB425_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -7409,7 +7409,7 @@ define i16 @test426(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 3, 0, 5
; PPC64LE-NEXT: nand 6, 4, 3
; PPC64LE-NEXT: sthcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB426_1
+; PPC64LE-NEXT: bne- 0, .LBB426_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -7425,7 +7425,7 @@ define i16 @test427(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB427_1
+; PPC64LE-NEXT: bne- 0, .LBB427_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -7441,7 +7441,7 @@ define i16 @test428(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB428_1
+; PPC64LE-NEXT: bne- 0, .LBB428_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -7458,7 +7458,7 @@ define i16 @test429(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB429_1
+; PPC64LE-NEXT: bne- 0, .LBB429_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -7474,7 +7474,7 @@ define i32 @test430(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB430_1
+; PPC64LE-NEXT: bne- 0, .LBB430_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -7490,7 +7490,7 @@ define i32 @test431(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 3, 0, 5
; PPC64LE-NEXT: nand 6, 4, 3
; PPC64LE-NEXT: stwcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB431_1
+; PPC64LE-NEXT: bne- 0, .LBB431_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -7506,7 +7506,7 @@ define i32 @test432(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB432_1
+; PPC64LE-NEXT: bne- 0, .LBB432_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -7522,7 +7522,7 @@ define i32 @test433(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB433_1
+; PPC64LE-NEXT: bne- 0, .LBB433_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -7539,7 +7539,7 @@ define i32 @test434(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB434_1
+; PPC64LE-NEXT: bne- 0, .LBB434_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -7555,7 +7555,7 @@ define i64 @test435(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB435_1
+; PPC64LE-NEXT: bne- 0, .LBB435_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -7571,7 +7571,7 @@ define i64 @test436(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 3, 0, 5
; PPC64LE-NEXT: nand 6, 4, 3
; PPC64LE-NEXT: stdcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB436_1
+; PPC64LE-NEXT: bne- 0, .LBB436_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -7587,7 +7587,7 @@ define i64 @test437(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB437_1
+; PPC64LE-NEXT: bne- 0, .LBB437_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -7603,7 +7603,7 @@ define i64 @test438(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB438_1
+; PPC64LE-NEXT: bne- 0, .LBB438_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -7620,7 +7620,7 @@ define i64 @test439(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: nand 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB439_1
+; PPC64LE-NEXT: bne- 0, .LBB439_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -7636,7 +7636,7 @@ define i8 @test440(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB440_1
+; PPC64LE-NEXT: bne- 0, .LBB440_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -7652,7 +7652,7 @@ define i8 @test441(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 3, 0, 5
; PPC64LE-NEXT: or 6, 4, 3
; PPC64LE-NEXT: stbcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB441_1
+; PPC64LE-NEXT: bne- 0, .LBB441_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -7668,7 +7668,7 @@ define i8 @test442(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB442_1
+; PPC64LE-NEXT: bne- 0, .LBB442_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -7684,7 +7684,7 @@ define i8 @test443(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB443_1
+; PPC64LE-NEXT: bne- 0, .LBB443_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -7701,7 +7701,7 @@ define i8 @test444(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB444_1
+; PPC64LE-NEXT: bne- 0, .LBB444_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -7717,7 +7717,7 @@ define i16 @test445(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB445_1
+; PPC64LE-NEXT: bne- 0, .LBB445_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -7733,7 +7733,7 @@ define i16 @test446(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 3, 0, 5
; PPC64LE-NEXT: or 6, 4, 3
; PPC64LE-NEXT: sthcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB446_1
+; PPC64LE-NEXT: bne- 0, .LBB446_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -7749,7 +7749,7 @@ define i16 @test447(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB447_1
+; PPC64LE-NEXT: bne- 0, .LBB447_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -7765,7 +7765,7 @@ define i16 @test448(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB448_1
+; PPC64LE-NEXT: bne- 0, .LBB448_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -7782,7 +7782,7 @@ define i16 @test449(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB449_1
+; PPC64LE-NEXT: bne- 0, .LBB449_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -7798,7 +7798,7 @@ define i32 @test450(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB450_1
+; PPC64LE-NEXT: bne- 0, .LBB450_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -7814,7 +7814,7 @@ define i32 @test451(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 3, 0, 5
; PPC64LE-NEXT: or 6, 4, 3
; PPC64LE-NEXT: stwcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB451_1
+; PPC64LE-NEXT: bne- 0, .LBB451_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -7830,7 +7830,7 @@ define i32 @test452(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB452_1
+; PPC64LE-NEXT: bne- 0, .LBB452_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -7846,7 +7846,7 @@ define i32 @test453(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB453_1
+; PPC64LE-NEXT: bne- 0, .LBB453_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -7863,7 +7863,7 @@ define i32 @test454(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB454_1
+; PPC64LE-NEXT: bne- 0, .LBB454_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -7879,7 +7879,7 @@ define i64 @test455(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB455_1
+; PPC64LE-NEXT: bne- 0, .LBB455_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -7895,7 +7895,7 @@ define i64 @test456(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 3, 0, 5
; PPC64LE-NEXT: or 6, 4, 3
; PPC64LE-NEXT: stdcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB456_1
+; PPC64LE-NEXT: bne- 0, .LBB456_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -7911,7 +7911,7 @@ define i64 @test457(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB457_1
+; PPC64LE-NEXT: bne- 0, .LBB457_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -7927,7 +7927,7 @@ define i64 @test458(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB458_1
+; PPC64LE-NEXT: bne- 0, .LBB458_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -7944,7 +7944,7 @@ define i64 @test459(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: or 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB459_1
+; PPC64LE-NEXT: bne- 0, .LBB459_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -7960,7 +7960,7 @@ define i8 @test460(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB460_1
+; PPC64LE-NEXT: bne- 0, .LBB460_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -7976,7 +7976,7 @@ define i8 @test461(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 3, 0, 5
; PPC64LE-NEXT: xor 6, 4, 3
; PPC64LE-NEXT: stbcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB461_1
+; PPC64LE-NEXT: bne- 0, .LBB461_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -7992,7 +7992,7 @@ define i8 @test462(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB462_1
+; PPC64LE-NEXT: bne- 0, .LBB462_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -8008,7 +8008,7 @@ define i8 @test463(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB463_1
+; PPC64LE-NEXT: bne- 0, .LBB463_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -8025,7 +8025,7 @@ define i8 @test464(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: lbarx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: stbcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB464_1
+; PPC64LE-NEXT: bne- 0, .LBB464_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -8041,7 +8041,7 @@ define i16 @test465(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB465_1
+; PPC64LE-NEXT: bne- 0, .LBB465_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -8057,7 +8057,7 @@ define i16 @test466(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 3, 0, 5
; PPC64LE-NEXT: xor 6, 4, 3
; PPC64LE-NEXT: sthcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB466_1
+; PPC64LE-NEXT: bne- 0, .LBB466_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -8073,7 +8073,7 @@ define i16 @test467(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB467_1
+; PPC64LE-NEXT: bne- 0, .LBB467_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -8089,7 +8089,7 @@ define i16 @test468(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB468_1
+; PPC64LE-NEXT: bne- 0, .LBB468_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -8106,7 +8106,7 @@ define i16 @test469(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: lharx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: sthcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB469_1
+; PPC64LE-NEXT: bne- 0, .LBB469_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -8122,7 +8122,7 @@ define i32 @test470(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB470_1
+; PPC64LE-NEXT: bne- 0, .LBB470_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -8138,7 +8138,7 @@ define i32 @test471(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 3, 0, 5
; PPC64LE-NEXT: xor 6, 4, 3
; PPC64LE-NEXT: stwcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB471_1
+; PPC64LE-NEXT: bne- 0, .LBB471_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -8154,7 +8154,7 @@ define i32 @test472(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB472_1
+; PPC64LE-NEXT: bne- 0, .LBB472_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -8170,7 +8170,7 @@ define i32 @test473(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB473_1
+; PPC64LE-NEXT: bne- 0, .LBB473_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -8187,7 +8187,7 @@ define i32 @test474(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: lwarx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: stwcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB474_1
+; PPC64LE-NEXT: bne- 0, .LBB474_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -8203,7 +8203,7 @@ define i64 @test475(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB475_1
+; PPC64LE-NEXT: bne- 0, .LBB475_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -8219,7 +8219,7 @@ define i64 @test476(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 3, 0, 5
; PPC64LE-NEXT: xor 6, 4, 3
; PPC64LE-NEXT: stdcx. 6, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB476_1
+; PPC64LE-NEXT: bne- 0, .LBB476_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -8235,7 +8235,7 @@ define i64 @test477(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB477_1
+; PPC64LE-NEXT: bne- 0, .LBB477_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -8251,7 +8251,7 @@ define i64 @test478(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB478_1
+; PPC64LE-NEXT: bne- 0, .LBB478_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -8268,7 +8268,7 @@ define i64 @test479(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: ldarx 5, 0, 3
; PPC64LE-NEXT: xor 6, 4, 5
; PPC64LE-NEXT: stdcx. 6, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB479_1
+; PPC64LE-NEXT: bne- 0, .LBB479_1
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -8288,7 +8288,7 @@ define i8 @test480(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: bgt 0, .LBB480_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB480_1
+; PPC64LE-NEXT: bne- 0, .LBB480_1
; PPC64LE-NEXT: .LBB480_3:
; PPC64LE-NEXT: mr 3, 4
; PPC64LE-NEXT: blr
@@ -8307,7 +8307,7 @@ define i8 @test481(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: bgt 0, .LBB481_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB481_1
+; PPC64LE-NEXT: bne- 0, .LBB481_1
; PPC64LE-NEXT: .LBB481_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 4
@@ -8328,7 +8328,7 @@ define i8 @test482(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: bgt 0, .LBB482_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB482_1
+; PPC64LE-NEXT: bne- 0, .LBB482_1
; PPC64LE-NEXT: .LBB482_3:
; PPC64LE-NEXT: mr 3, 4
; PPC64LE-NEXT: blr
@@ -8348,7 +8348,7 @@ define i8 @test483(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: bgt 0, .LBB483_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB483_1
+; PPC64LE-NEXT: bne- 0, .LBB483_1
; PPC64LE-NEXT: .LBB483_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 4
@@ -8369,7 +8369,7 @@ define i8 @test484(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: bgt 0, .LBB484_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB484_1
+; PPC64LE-NEXT: bne- 0, .LBB484_1
; PPC64LE-NEXT: .LBB484_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 4
@@ -8389,7 +8389,7 @@ define i16 @test485(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: bgt 0, .LBB485_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB485_1
+; PPC64LE-NEXT: bne- 0, .LBB485_1
; PPC64LE-NEXT: .LBB485_3:
; PPC64LE-NEXT: mr 3, 4
; PPC64LE-NEXT: blr
@@ -8408,7 +8408,7 @@ define i16 @test486(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: bgt 0, .LBB486_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB486_1
+; PPC64LE-NEXT: bne- 0, .LBB486_1
; PPC64LE-NEXT: .LBB486_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 4
@@ -8429,7 +8429,7 @@ define i16 @test487(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: bgt 0, .LBB487_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB487_1
+; PPC64LE-NEXT: bne- 0, .LBB487_1
; PPC64LE-NEXT: .LBB487_3:
; PPC64LE-NEXT: mr 3, 4
; PPC64LE-NEXT: blr
@@ -8449,7 +8449,7 @@ define i16 @test488(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: bgt 0, .LBB488_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB488_1
+; PPC64LE-NEXT: bne- 0, .LBB488_1
; PPC64LE-NEXT: .LBB488_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 4
@@ -8470,7 +8470,7 @@ define i16 @test489(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: bgt 0, .LBB489_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB489_1
+; PPC64LE-NEXT: bne- 0, .LBB489_1
; PPC64LE-NEXT: .LBB489_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 4
@@ -8488,7 +8488,7 @@ define i32 @test490(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: bgt 0, .LBB490_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB490_1
+; PPC64LE-NEXT: bne- 0, .LBB490_1
; PPC64LE-NEXT: .LBB490_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -8506,7 +8506,7 @@ define i32 @test491(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: bgt 0, .LBB491_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB491_1
+; PPC64LE-NEXT: bne- 0, .LBB491_1
; PPC64LE-NEXT: .LBB491_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -8524,7 +8524,7 @@ define i32 @test492(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: bgt 0, .LBB492_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB492_1
+; PPC64LE-NEXT: bne- 0, .LBB492_1
; PPC64LE-NEXT: .LBB492_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -8542,7 +8542,7 @@ define i32 @test493(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: bgt 0, .LBB493_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB493_1
+; PPC64LE-NEXT: bne- 0, .LBB493_1
; PPC64LE-NEXT: .LBB493_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -8561,7 +8561,7 @@ define i32 @test494(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: bgt 0, .LBB494_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB494_1
+; PPC64LE-NEXT: bne- 0, .LBB494_1
; PPC64LE-NEXT: .LBB494_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -8579,7 +8579,7 @@ define i64 @test495(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: bgt 0, .LBB495_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB495_1
+; PPC64LE-NEXT: bne- 0, .LBB495_1
; PPC64LE-NEXT: .LBB495_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -8597,7 +8597,7 @@ define i64 @test496(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: bgt 0, .LBB496_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB496_1
+; PPC64LE-NEXT: bne- 0, .LBB496_1
; PPC64LE-NEXT: .LBB496_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -8615,7 +8615,7 @@ define i64 @test497(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: bgt 0, .LBB497_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB497_1
+; PPC64LE-NEXT: bne- 0, .LBB497_1
; PPC64LE-NEXT: .LBB497_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -8633,7 +8633,7 @@ define i64 @test498(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: bgt 0, .LBB498_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB498_1
+; PPC64LE-NEXT: bne- 0, .LBB498_1
; PPC64LE-NEXT: .LBB498_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -8652,7 +8652,7 @@ define i64 @test499(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: bgt 0, .LBB499_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB499_1
+; PPC64LE-NEXT: bne- 0, .LBB499_1
; PPC64LE-NEXT: .LBB499_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -8672,7 +8672,7 @@ define i8 @test500(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: blt 0, .LBB500_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB500_1
+; PPC64LE-NEXT: bne- 0, .LBB500_1
; PPC64LE-NEXT: .LBB500_3:
; PPC64LE-NEXT: mr 3, 4
; PPC64LE-NEXT: blr
@@ -8691,7 +8691,7 @@ define i8 @test501(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: blt 0, .LBB501_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB501_1
+; PPC64LE-NEXT: bne- 0, .LBB501_1
; PPC64LE-NEXT: .LBB501_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 4
@@ -8712,7 +8712,7 @@ define i8 @test502(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: blt 0, .LBB502_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB502_1
+; PPC64LE-NEXT: bne- 0, .LBB502_1
; PPC64LE-NEXT: .LBB502_3:
; PPC64LE-NEXT: mr 3, 4
; PPC64LE-NEXT: blr
@@ -8732,7 +8732,7 @@ define i8 @test503(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: blt 0, .LBB503_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB503_1
+; PPC64LE-NEXT: bne- 0, .LBB503_1
; PPC64LE-NEXT: .LBB503_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 4
@@ -8753,7 +8753,7 @@ define i8 @test504(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: blt 0, .LBB504_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB504_1
+; PPC64LE-NEXT: bne- 0, .LBB504_1
; PPC64LE-NEXT: .LBB504_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 4
@@ -8773,7 +8773,7 @@ define i16 @test505(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: blt 0, .LBB505_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB505_1
+; PPC64LE-NEXT: bne- 0, .LBB505_1
; PPC64LE-NEXT: .LBB505_3:
; PPC64LE-NEXT: mr 3, 4
; PPC64LE-NEXT: blr
@@ -8792,7 +8792,7 @@ define i16 @test506(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: blt 0, .LBB506_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB506_1
+; PPC64LE-NEXT: bne- 0, .LBB506_1
; PPC64LE-NEXT: .LBB506_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 4
@@ -8813,7 +8813,7 @@ define i16 @test507(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: blt 0, .LBB507_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB507_1
+; PPC64LE-NEXT: bne- 0, .LBB507_1
; PPC64LE-NEXT: .LBB507_3:
; PPC64LE-NEXT: mr 3, 4
; PPC64LE-NEXT: blr
@@ -8833,7 +8833,7 @@ define i16 @test508(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: blt 0, .LBB508_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB508_1
+; PPC64LE-NEXT: bne- 0, .LBB508_1
; PPC64LE-NEXT: .LBB508_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 4
@@ -8854,7 +8854,7 @@ define i16 @test509(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: blt 0, .LBB509_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 5, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB509_1
+; PPC64LE-NEXT: bne- 0, .LBB509_1
; PPC64LE-NEXT: .LBB509_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 4
@@ -8872,7 +8872,7 @@ define i32 @test510(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: blt 0, .LBB510_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB510_1
+; PPC64LE-NEXT: bne- 0, .LBB510_1
; PPC64LE-NEXT: .LBB510_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -8890,7 +8890,7 @@ define i32 @test511(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: blt 0, .LBB511_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB511_1
+; PPC64LE-NEXT: bne- 0, .LBB511_1
; PPC64LE-NEXT: .LBB511_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -8908,7 +8908,7 @@ define i32 @test512(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: blt 0, .LBB512_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB512_1
+; PPC64LE-NEXT: bne- 0, .LBB512_1
; PPC64LE-NEXT: .LBB512_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -8926,7 +8926,7 @@ define i32 @test513(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: blt 0, .LBB513_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB513_1
+; PPC64LE-NEXT: bne- 0, .LBB513_1
; PPC64LE-NEXT: .LBB513_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -8945,7 +8945,7 @@ define i32 @test514(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: blt 0, .LBB514_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB514_1
+; PPC64LE-NEXT: bne- 0, .LBB514_1
; PPC64LE-NEXT: .LBB514_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -8963,7 +8963,7 @@ define i64 @test515(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: blt 0, .LBB515_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB515_1
+; PPC64LE-NEXT: bne- 0, .LBB515_1
; PPC64LE-NEXT: .LBB515_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -8981,7 +8981,7 @@ define i64 @test516(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: blt 0, .LBB516_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB516_1
+; PPC64LE-NEXT: bne- 0, .LBB516_1
; PPC64LE-NEXT: .LBB516_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -8999,7 +8999,7 @@ define i64 @test517(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: blt 0, .LBB517_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB517_1
+; PPC64LE-NEXT: bne- 0, .LBB517_1
; PPC64LE-NEXT: .LBB517_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -9017,7 +9017,7 @@ define i64 @test518(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: blt 0, .LBB518_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB518_1
+; PPC64LE-NEXT: bne- 0, .LBB518_1
; PPC64LE-NEXT: .LBB518_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -9036,7 +9036,7 @@ define i64 @test519(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: blt 0, .LBB519_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB519_1
+; PPC64LE-NEXT: bne- 0, .LBB519_1
; PPC64LE-NEXT: .LBB519_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -9054,7 +9054,7 @@ define i8 @test520(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: bgt 0, .LBB520_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB520_1
+; PPC64LE-NEXT: bne- 0, .LBB520_1
; PPC64LE-NEXT: .LBB520_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -9072,7 +9072,7 @@ define i8 @test521(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: bgt 0, .LBB521_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB521_1
+; PPC64LE-NEXT: bne- 0, .LBB521_1
; PPC64LE-NEXT: .LBB521_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -9090,7 +9090,7 @@ define i8 @test522(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: bgt 0, .LBB522_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB522_1
+; PPC64LE-NEXT: bne- 0, .LBB522_1
; PPC64LE-NEXT: .LBB522_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -9108,7 +9108,7 @@ define i8 @test523(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: bgt 0, .LBB523_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB523_1
+; PPC64LE-NEXT: bne- 0, .LBB523_1
; PPC64LE-NEXT: .LBB523_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -9127,7 +9127,7 @@ define i8 @test524(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: bgt 0, .LBB524_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB524_1
+; PPC64LE-NEXT: bne- 0, .LBB524_1
; PPC64LE-NEXT: .LBB524_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -9145,7 +9145,7 @@ define i16 @test525(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: bgt 0, .LBB525_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB525_1
+; PPC64LE-NEXT: bne- 0, .LBB525_1
; PPC64LE-NEXT: .LBB525_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -9163,7 +9163,7 @@ define i16 @test526(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: bgt 0, .LBB526_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB526_1
+; PPC64LE-NEXT: bne- 0, .LBB526_1
; PPC64LE-NEXT: .LBB526_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -9181,7 +9181,7 @@ define i16 @test527(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: bgt 0, .LBB527_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB527_1
+; PPC64LE-NEXT: bne- 0, .LBB527_1
; PPC64LE-NEXT: .LBB527_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -9199,7 +9199,7 @@ define i16 @test528(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: bgt 0, .LBB528_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB528_1
+; PPC64LE-NEXT: bne- 0, .LBB528_1
; PPC64LE-NEXT: .LBB528_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -9218,7 +9218,7 @@ define i16 @test529(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: bgt 0, .LBB529_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB529_1
+; PPC64LE-NEXT: bne- 0, .LBB529_1
; PPC64LE-NEXT: .LBB529_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -9236,7 +9236,7 @@ define i32 @test530(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: bgt 0, .LBB530_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB530_1
+; PPC64LE-NEXT: bne- 0, .LBB530_1
; PPC64LE-NEXT: .LBB530_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -9254,7 +9254,7 @@ define i32 @test531(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: bgt 0, .LBB531_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB531_1
+; PPC64LE-NEXT: bne- 0, .LBB531_1
; PPC64LE-NEXT: .LBB531_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -9272,7 +9272,7 @@ define i32 @test532(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: bgt 0, .LBB532_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB532_1
+; PPC64LE-NEXT: bne- 0, .LBB532_1
; PPC64LE-NEXT: .LBB532_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -9290,7 +9290,7 @@ define i32 @test533(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: bgt 0, .LBB533_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB533_1
+; PPC64LE-NEXT: bne- 0, .LBB533_1
; PPC64LE-NEXT: .LBB533_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -9309,7 +9309,7 @@ define i32 @test534(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: bgt 0, .LBB534_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB534_1
+; PPC64LE-NEXT: bne- 0, .LBB534_1
; PPC64LE-NEXT: .LBB534_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -9327,7 +9327,7 @@ define i64 @test535(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: bgt 0, .LBB535_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB535_1
+; PPC64LE-NEXT: bne- 0, .LBB535_1
; PPC64LE-NEXT: .LBB535_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -9345,7 +9345,7 @@ define i64 @test536(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: bgt 0, .LBB536_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB536_1
+; PPC64LE-NEXT: bne- 0, .LBB536_1
; PPC64LE-NEXT: .LBB536_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -9363,7 +9363,7 @@ define i64 @test537(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: bgt 0, .LBB537_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB537_1
+; PPC64LE-NEXT: bne- 0, .LBB537_1
; PPC64LE-NEXT: .LBB537_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -9381,7 +9381,7 @@ define i64 @test538(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: bgt 0, .LBB538_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB538_1
+; PPC64LE-NEXT: bne- 0, .LBB538_1
; PPC64LE-NEXT: .LBB538_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -9400,7 +9400,7 @@ define i64 @test539(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: bgt 0, .LBB539_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB539_1
+; PPC64LE-NEXT: bne- 0, .LBB539_1
; PPC64LE-NEXT: .LBB539_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -9418,7 +9418,7 @@ define i8 @test540(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: blt 0, .LBB540_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB540_1
+; PPC64LE-NEXT: bne- 0, .LBB540_1
; PPC64LE-NEXT: .LBB540_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -9436,7 +9436,7 @@ define i8 @test541(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: blt 0, .LBB541_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB541_1
+; PPC64LE-NEXT: bne- 0, .LBB541_1
; PPC64LE-NEXT: .LBB541_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -9454,7 +9454,7 @@ define i8 @test542(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: blt 0, .LBB542_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB542_1
+; PPC64LE-NEXT: bne- 0, .LBB542_1
; PPC64LE-NEXT: .LBB542_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -9472,7 +9472,7 @@ define i8 @test543(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: blt 0, .LBB543_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB543_1
+; PPC64LE-NEXT: bne- 0, .LBB543_1
; PPC64LE-NEXT: .LBB543_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -9491,7 +9491,7 @@ define i8 @test544(ptr %ptr, i8 %val) {
; PPC64LE-NEXT: blt 0, .LBB544_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stbcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB544_1
+; PPC64LE-NEXT: bne- 0, .LBB544_1
; PPC64LE-NEXT: .LBB544_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -9509,7 +9509,7 @@ define i16 @test545(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: blt 0, .LBB545_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB545_1
+; PPC64LE-NEXT: bne- 0, .LBB545_1
; PPC64LE-NEXT: .LBB545_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -9527,7 +9527,7 @@ define i16 @test546(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: blt 0, .LBB546_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB546_1
+; PPC64LE-NEXT: bne- 0, .LBB546_1
; PPC64LE-NEXT: .LBB546_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -9545,7 +9545,7 @@ define i16 @test547(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: blt 0, .LBB547_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB547_1
+; PPC64LE-NEXT: bne- 0, .LBB547_1
; PPC64LE-NEXT: .LBB547_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -9563,7 +9563,7 @@ define i16 @test548(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: blt 0, .LBB548_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB548_1
+; PPC64LE-NEXT: bne- 0, .LBB548_1
; PPC64LE-NEXT: .LBB548_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -9582,7 +9582,7 @@ define i16 @test549(ptr %ptr, i16 %val) {
; PPC64LE-NEXT: blt 0, .LBB549_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: sthcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB549_1
+; PPC64LE-NEXT: bne- 0, .LBB549_1
; PPC64LE-NEXT: .LBB549_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -9600,7 +9600,7 @@ define i32 @test550(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: blt 0, .LBB550_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB550_1
+; PPC64LE-NEXT: bne- 0, .LBB550_1
; PPC64LE-NEXT: .LBB550_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -9618,7 +9618,7 @@ define i32 @test551(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: blt 0, .LBB551_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB551_1
+; PPC64LE-NEXT: bne- 0, .LBB551_1
; PPC64LE-NEXT: .LBB551_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -9636,7 +9636,7 @@ define i32 @test552(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: blt 0, .LBB552_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB552_1
+; PPC64LE-NEXT: bne- 0, .LBB552_1
; PPC64LE-NEXT: .LBB552_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -9654,7 +9654,7 @@ define i32 @test553(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: blt 0, .LBB553_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB553_1
+; PPC64LE-NEXT: bne- 0, .LBB553_1
; PPC64LE-NEXT: .LBB553_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -9673,7 +9673,7 @@ define i32 @test554(ptr %ptr, i32 %val) {
; PPC64LE-NEXT: blt 0, .LBB554_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stwcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB554_1
+; PPC64LE-NEXT: bne- 0, .LBB554_1
; PPC64LE-NEXT: .LBB554_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -9691,7 +9691,7 @@ define i64 @test555(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: blt 0, .LBB555_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB555_1
+; PPC64LE-NEXT: bne- 0, .LBB555_1
; PPC64LE-NEXT: .LBB555_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -9709,7 +9709,7 @@ define i64 @test556(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: blt 0, .LBB556_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 5
-; PPC64LE-NEXT: bne 0, .LBB556_1
+; PPC64LE-NEXT: bne- 0, .LBB556_1
; PPC64LE-NEXT: .LBB556_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
@@ -9727,7 +9727,7 @@ define i64 @test557(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: blt 0, .LBB557_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB557_1
+; PPC64LE-NEXT: bne- 0, .LBB557_1
; PPC64LE-NEXT: .LBB557_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
@@ -9745,7 +9745,7 @@ define i64 @test558(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: blt 0, .LBB558_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB558_1
+; PPC64LE-NEXT: bne- 0, .LBB558_1
; PPC64LE-NEXT: .LBB558_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
@@ -9764,7 +9764,7 @@ define i64 @test559(ptr %ptr, i64 %val) {
; PPC64LE-NEXT: blt 0, .LBB559_3
; PPC64LE-NEXT: # %bb.2:
; PPC64LE-NEXT: stdcx. 4, 0, 3
-; PPC64LE-NEXT: bne 0, .LBB559_1
+; PPC64LE-NEXT: bne- 0, .LBB559_1
; PPC64LE-NEXT: .LBB559_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: mr 3, 5
diff --git a/llvm/test/CodeGen/PowerPC/atomics.ll b/llvm/test/CodeGen/PowerPC/atomics.ll
index 183c8e1..ff1a722 100644
--- a/llvm/test/CodeGen/PowerPC/atomics.ll
+++ b/llvm/test/CodeGen/PowerPC/atomics.ll
@@ -341,7 +341,7 @@ define i8 @add_i8_monotonic(ptr %mem, i8 %operand) {
; PPC32-NEXT: and r8, r8, r6
; PPC32-NEXT: or r8, r8, r9
; PPC32-NEXT: stwcx. r8, 0, r5
-; PPC32-NEXT: bne cr0, .LBB12_1
+; PPC32-NEXT: bne- cr0, .LBB12_1
; PPC32-NEXT: # %bb.2:
; PPC32-NEXT: srw r3, r7, r3
; PPC32-NEXT: clrlwi r3, r3, 24
@@ -362,7 +362,7 @@ define i8 @add_i8_monotonic(ptr %mem, i8 %operand) {
; PPC64-NEXT: and r8, r8, r6
; PPC64-NEXT: or r8, r8, r9
; PPC64-NEXT: stwcx. r8, 0, r5
-; PPC64-NEXT: bne cr0, .LBB12_1
+; PPC64-NEXT: bne- cr0, .LBB12_1
; PPC64-NEXT: # %bb.2:
; PPC64-NEXT: srw r3, r7, r3
; PPC64-NEXT: clrlwi r3, r3, 24
@@ -388,7 +388,7 @@ define i16 @xor_i16_seq_cst(ptr %mem, i16 %operand) {
; PPC32-NEXT: and r8, r8, r6
; PPC32-NEXT: or r8, r8, r9
; PPC32-NEXT: stwcx. r8, 0, r3
-; PPC32-NEXT: bne cr0, .LBB13_1
+; PPC32-NEXT: bne- cr0, .LBB13_1
; PPC32-NEXT: # %bb.2:
; PPC32-NEXT: srw r3, r7, r5
; PPC32-NEXT: clrlwi r3, r3, 16
@@ -412,7 +412,7 @@ define i16 @xor_i16_seq_cst(ptr %mem, i16 %operand) {
; PPC64-NEXT: and r8, r8, r6
; PPC64-NEXT: or r8, r8, r9
; PPC64-NEXT: stwcx. r8, 0, r3
-; PPC64-NEXT: bne cr0, .LBB13_1
+; PPC64-NEXT: bne- cr0, .LBB13_1
; PPC64-NEXT: # %bb.2:
; PPC64-NEXT: srw r3, r7, r5
; PPC64-NEXT: clrlwi r3, r3, 16
@@ -428,7 +428,7 @@ define i32 @xchg_i32_acq_rel(ptr %mem, i32 %operand) {
; CHECK-NEXT: .LBB14_1:
; CHECK-NEXT: lwarx r5, 0, r3
; CHECK-NEXT: stwcx. r4, 0, r3
-; CHECK-NEXT: bne cr0, .LBB14_1
+; CHECK-NEXT: bne- cr0, .LBB14_1
; CHECK-NEXT: # %bb.2:
; CHECK-NEXT: mr r3, r5
; CHECK-NEXT: lwsync
@@ -458,7 +458,7 @@ define i64 @and_i64_release(ptr %mem, i64 %operand) {
; PPC64-NEXT: ldarx r5, 0, r3
; PPC64-NEXT: and r6, r4, r5
; PPC64-NEXT: stdcx. r6, 0, r3
-; PPC64-NEXT: bne cr0, .LBB15_1
+; PPC64-NEXT: bne- cr0, .LBB15_1
; PPC64-NEXT: # %bb.2:
; PPC64-NEXT: mr r3, r5
; PPC64-NEXT: blr
diff --git a/llvm/test/CodeGen/PowerPC/i64_fp_round.ll b/llvm/test/CodeGen/PowerPC/i64_fp_round.ll
index f7df003..b1fb907 100644
--- a/llvm/test/CodeGen/PowerPC/i64_fp_round.ll
+++ b/llvm/test/CodeGen/PowerPC/i64_fp_round.ll
@@ -4,10 +4,9 @@
; for minor code generation differences.
; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-fpcvt < %s | FileCheck %s
; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-fpcvt -mattr=-isel < %s | FileCheck %s --check-prefix=CHECK-NO-ISEL
-; Also check that with -enable-unsafe-fp-math we do not get that extra
+; Also check that with fpexcept.ignore we do not get that extra
; code sequence. Simply verify that there is no "isel" present.
-; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-fpcvt -enable-unsafe-fp-math < %s | FileCheck %s -check-prefix=CHECK-UNSAFE
-; CHECK-UNSAFE-NOT: isel
+
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
@@ -15,9 +14,8 @@ define float @test(i64 %x) nounwind readnone {
; Verify that we get the code sequence needed to avoid double-rounding.
; Note that only parts of the sequence are checked for here, to allow
; for minor code generation differences.
-; Also check that with -enable-unsafe-fp-math we do not get that extra
+; Also check that with fpexcept.ignore we do not get that extra
; code sequence. Simply verify that there is no "isel" present.
-; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-fpcvt -enable-unsafe-fp-math < %s | FileCheck %s -check-prefix=CHECK-UNSAFE
; CHECK-LABEL: test:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: clrldi 4, 3, 53
@@ -51,18 +49,10 @@ define float @test(i64 %x) nounwind readnone {
; CHECK-NO-ISEL-NEXT: xscvsxddp 0, 0
; CHECK-NO-ISEL-NEXT: frsp 1, 0
; CHECK-NO-ISEL-NEXT: blr
-;
-; CHECK-UNSAFE-LABEL: test:
-; CHECK-UNSAFE: # %bb.0: # %entry
-; CHECK-UNSAFE-NEXT: std 3, -8(1)
-; CHECK-UNSAFE-NEXT: lfd 0, -8(1)
-; CHECK-UNSAFE-NEXT: xscvsxddp 0, 0
-; CHECK-UNSAFE-NEXT: frsp 1, 0
-; CHECK-UNSAFE-NEXT: blr
entry:
%conv = sitofp i64 %x to float
ret float %conv
}
-
+; TODO: Add sitofp afn test.
diff --git a/llvm/test/CodeGen/PowerPC/ppc-partword-atomic.ll b/llvm/test/CodeGen/PowerPC/ppc-partword-atomic.ll
index 9e3eea1..a6f5c61 100644
--- a/llvm/test/CodeGen/PowerPC/ppc-partword-atomic.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc-partword-atomic.ll
@@ -25,7 +25,7 @@ define dso_local zeroext i32 @testI8(i8 zeroext %val) local_unnamed_addr #0 {
; PWR7-NEXT: andc 8, 7, 3
; PWR7-NEXT: or 8, 6, 8
; PWR7-NEXT: stwcx. 8, 0, 5
-; PWR7-NEXT: bne 0, .LBB0_1
+; PWR7-NEXT: bne- 0, .LBB0_1
; PWR7-NEXT: # %bb.2: # %entry
; PWR7-NEXT: srw 3, 7, 4
; PWR7-NEXT: addis 4, 2, global_int@toc@ha
@@ -44,7 +44,7 @@ define dso_local zeroext i32 @testI8(i8 zeroext %val) local_unnamed_addr #0 {
; PWR9-NEXT: #
; PWR9-NEXT: lbarx 4, 0, 5
; PWR9-NEXT: stbcx. 3, 0, 5
-; PWR9-NEXT: bne 0, .LBB0_1
+; PWR9-NEXT: bne- 0, .LBB0_1
; PWR9-NEXT: # %bb.2: # %entry
; PWR9-NEXT: addis 3, 2, global_int@toc@ha
; PWR9-NEXT: lwsync
@@ -78,7 +78,7 @@ define dso_local zeroext i32 @testI16(i16 zeroext %val) local_unnamed_addr #0 {
; PWR7-NEXT: andc 8, 7, 3
; PWR7-NEXT: or 8, 6, 8
; PWR7-NEXT: stwcx. 8, 0, 5
-; PWR7-NEXT: bne 0, .LBB1_1
+; PWR7-NEXT: bne- 0, .LBB1_1
; PWR7-NEXT: # %bb.2: # %entry
; PWR7-NEXT: srw 3, 7, 4
; PWR7-NEXT: addis 4, 2, global_int@toc@ha
@@ -97,7 +97,7 @@ define dso_local zeroext i32 @testI16(i16 zeroext %val) local_unnamed_addr #0 {
; PWR9-NEXT: #
; PWR9-NEXT: lharx 4, 0, 5
; PWR9-NEXT: sthcx. 3, 0, 5
-; PWR9-NEXT: bne 0, .LBB1_1
+; PWR9-NEXT: bne- 0, .LBB1_1
; PWR9-NEXT: # %bb.2: # %entry
; PWR9-NEXT: addis 3, 2, global_int@toc@ha
; PWR9-NEXT: lwsync
diff --git a/llvm/test/CodeGen/PowerPC/pr61882.ll b/llvm/test/CodeGen/PowerPC/pr61882.ll
index c649fe0..062d97c 100644
--- a/llvm/test/CodeGen/PowerPC/pr61882.ll
+++ b/llvm/test/CodeGen/PowerPC/pr61882.ll
@@ -27,7 +27,7 @@ define void @foo(ptr %a, i32 %x) {
; CHECK-NEXT: andc r8, r8, r6
; CHECK-NEXT: or r8, r7, r8
; CHECK-NEXT: stwcx. r8, 0, r3
-; CHECK-NEXT: bne cr0, .LBB0_1
+; CHECK-NEXT: bne- cr0, .LBB0_1
; CHECK-NEXT: .LBB0_3:
; CHECK-NEXT: lwsync
; CHECK-NEXT: blr
@@ -43,7 +43,7 @@ define void @foo(ptr %a, i32 %x) {
; PWR8-NEXT: bgt cr0, .LBB0_3
; PWR8-NEXT: # %bb.2:
; PWR8-NEXT: stbcx. r4, 0, r3
-; PWR8-NEXT: bne cr0, .LBB0_1
+; PWR8-NEXT: bne- cr0, .LBB0_1
; PWR8-NEXT: .LBB0_3:
; PWR8-NEXT: lwsync
; PWR8-NEXT: blr
diff --git a/llvm/test/CodeGen/PowerPC/scalar-rounding-ops.ll b/llvm/test/CodeGen/PowerPC/scalar-rounding-ops.ll
index 2be370f..af48bf2 100644
--- a/llvm/test/CodeGen/PowerPC/scalar-rounding-ops.ll
+++ b/llvm/test/CodeGen/PowerPC/scalar-rounding-ops.ll
@@ -5,9 +5,6 @@
; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
; RUN: -mtriple=powerpc64le-unknown-unknown -verify-machineinstrs < %s | \
; RUN: FileCheck %s
-; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
-; RUN: -mtriple=powerpc64le-unknown-unknown -verify-machineinstrs < %s \
-; RUN: --enable-unsafe-fp-math | FileCheck %s --check-prefix=FAST
define dso_local i64 @test_lrint(double %d) local_unnamed_addr {
; BE-LABEL: test_lrint:
; BE: # %bb.0: # %entry
@@ -36,17 +33,36 @@ define dso_local i64 @test_lrint(double %d) local_unnamed_addr {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: test_lrint:
-; FAST: # %bb.0: # %entry
-; FAST-NEXT: fctid f0, f1
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: blr
entry:
%0 = tail call i64 @llvm.lrint.i64.f64(double %d)
ret i64 %0
}
+define dso_local i64 @test_constrained_lrint(double %d) local_unnamed_addr {
+; BE-LABEL: test_constrained_lrint:
+; BE: # %bb.0: # %entry
+; BE-NEXT: mflr r0
+; BE-NEXT: stdu r1, -112(r1)
+; BE-NEXT: std r0, 128(r1)
+; BE-NEXT: .cfi_def_cfa_offset 112
+; BE-NEXT: .cfi_offset lr, 16
+; BE-NEXT: bl lrint
+; BE-NEXT: nop
+; BE-NEXT: addi r1, r1, 112
+; BE-NEXT: ld r0, 16(r1)
+; BE-NEXT: mtlr r0
+; BE-NEXT: blr
+;
+; CHECK-LABEL: test_constrained_lrint:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fctid f0, f1
+; CHECK-NEXT: mffprd r3, f0
+; CHECK-NEXT: blr
+entry:
+ %0 = tail call i64 @llvm.experimental.constrained.lrint(double %d, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret i64 %0
+}
+
declare i64 @llvm.lrint.i64.f64(double)
define dso_local i64 @test_lrintf(float %f) local_unnamed_addr {
@@ -77,17 +93,36 @@ define dso_local i64 @test_lrintf(float %f) local_unnamed_addr {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: test_lrintf:
-; FAST: # %bb.0: # %entry
-; FAST-NEXT: fctid f0, f1
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: blr
entry:
%0 = tail call i64 @llvm.lrint.i64.f32(float %f)
ret i64 %0
}
+define dso_local i64 @test_constrained_lrintf(float %f) local_unnamed_addr {
+; BE-LABEL: test_constrained_lrintf:
+; BE: # %bb.0: # %entry
+; BE-NEXT: mflr r0
+; BE-NEXT: stdu r1, -112(r1)
+; BE-NEXT: std r0, 128(r1)
+; BE-NEXT: .cfi_def_cfa_offset 112
+; BE-NEXT: .cfi_offset lr, 16
+; BE-NEXT: bl lrintf
+; BE-NEXT: nop
+; BE-NEXT: addi r1, r1, 112
+; BE-NEXT: ld r0, 16(r1)
+; BE-NEXT: mtlr r0
+; BE-NEXT: blr
+;
+; CHECK-LABEL: test_constrained_lrintf:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fctid f0, f1
+; CHECK-NEXT: mffprd r3, f0
+; CHECK-NEXT: blr
+entry:
+ %0 = tail call i64 @llvm.experimental.constrained.lrint(float %f, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret i64 %0
+}
+
declare i64 @llvm.lrint.i64.f32(float)
define dso_local i64 @test_llrint(double %d) local_unnamed_addr {
@@ -118,17 +153,36 @@ define dso_local i64 @test_llrint(double %d) local_unnamed_addr {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: test_llrint:
-; FAST: # %bb.0: # %entry
-; FAST-NEXT: fctid f0, f1
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: blr
entry:
%0 = tail call i64 @llvm.llrint.i64.f64(double %d)
ret i64 %0
}
+define dso_local i64 @test_constrained_llrint(double %d) local_unnamed_addr {
+; BE-LABEL: test_constrained_llrint:
+; BE: # %bb.0: # %entry
+; BE-NEXT: mflr r0
+; BE-NEXT: stdu r1, -112(r1)
+; BE-NEXT: std r0, 128(r1)
+; BE-NEXT: .cfi_def_cfa_offset 112
+; BE-NEXT: .cfi_offset lr, 16
+; BE-NEXT: bl llrint
+; BE-NEXT: nop
+; BE-NEXT: addi r1, r1, 112
+; BE-NEXT: ld r0, 16(r1)
+; BE-NEXT: mtlr r0
+; BE-NEXT: blr
+;
+; CHECK-LABEL: test_constrained_llrint:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fctid f0, f1
+; CHECK-NEXT: mffprd r3, f0
+; CHECK-NEXT: blr
+entry:
+ %0 = tail call i64 @llvm.experimental.constrained.llrint(double %d, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret i64 %0
+}
+
declare i64 @llvm.llrint.i64.f64(double)
define dso_local i64 @test_llrintf(float %f) local_unnamed_addr {
@@ -159,17 +213,36 @@ define dso_local i64 @test_llrintf(float %f) local_unnamed_addr {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: test_llrintf:
-; FAST: # %bb.0: # %entry
-; FAST-NEXT: fctid f0, f1
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: blr
entry:
%0 = tail call i64 @llvm.llrint.i64.f32(float %f)
ret i64 %0
}
+define dso_local i64 @test_constrained_llrintf(float %f) local_unnamed_addr {
+; BE-LABEL: test_constrained_llrintf:
+; BE: # %bb.0: # %entry
+; BE-NEXT: mflr r0
+; BE-NEXT: stdu r1, -112(r1)
+; BE-NEXT: std r0, 128(r1)
+; BE-NEXT: .cfi_def_cfa_offset 112
+; BE-NEXT: .cfi_offset lr, 16
+; BE-NEXT: bl llrintf
+; BE-NEXT: nop
+; BE-NEXT: addi r1, r1, 112
+; BE-NEXT: ld r0, 16(r1)
+; BE-NEXT: mtlr r0
+; BE-NEXT: blr
+;
+; CHECK-LABEL: test_constrained_llrintf:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fctid f0, f1
+; CHECK-NEXT: mffprd r3, f0
+; CHECK-NEXT: blr
+entry:
+ %0 = tail call i64 @llvm.experimental.constrained.llrint(float %f, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret i64 %0
+}
+
declare i64 @llvm.llrint.i64.f32(float)
define dso_local i64 @test_lround(double %d) local_unnamed_addr {
@@ -200,18 +273,37 @@ define dso_local i64 @test_lround(double %d) local_unnamed_addr {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: test_lround:
-; FAST: # %bb.0: # %entry
-; FAST-NEXT: xsrdpi f0, f1
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: blr
entry:
%0 = tail call i64 @llvm.lround.i64.f64(double %d)
ret i64 %0
}
+define dso_local i64 @test_constrained_lround(double %d) local_unnamed_addr {
+; BE-LABEL: test_constrained_lround:
+; BE: # %bb.0: # %entry
+; BE-NEXT: mflr r0
+; BE-NEXT: stdu r1, -112(r1)
+; BE-NEXT: std r0, 128(r1)
+; BE-NEXT: .cfi_def_cfa_offset 112
+; BE-NEXT: .cfi_offset lr, 16
+; BE-NEXT: bl lround
+; BE-NEXT: nop
+; BE-NEXT: addi r1, r1, 112
+; BE-NEXT: ld r0, 16(r1)
+; BE-NEXT: mtlr r0
+; BE-NEXT: blr
+;
+; CHECK-LABEL: test_constrained_lround:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xsrdpi f0, f1
+; CHECK-NEXT: fctid f0, f0
+; CHECK-NEXT: mffprd r3, f0
+; CHECK-NEXT: blr
+entry:
+ %0 = tail call i64 @llvm.experimental.constrained.lround(double %d, metadata !"fpexcept.ignore")
+ ret i64 %0
+}
+
declare i64 @llvm.lround.i64.f64(double)
define dso_local i32 @test_lroundi32f64(double %d) local_unnamed_addr {
@@ -242,18 +334,37 @@ define dso_local i32 @test_lroundi32f64(double %d) local_unnamed_addr {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: test_lroundi32f64:
-; FAST: # %bb.0: # %entry
-; FAST-NEXT: xsrdpi f0, f1
-; FAST-NEXT: fctiw f0, f0
-; FAST-NEXT: mffprwz r3, f0
-; FAST-NEXT: blr
entry:
%0 = tail call i32 @llvm.lround.i32.f64(double %d)
ret i32 %0
}
+define dso_local i32 @test_constrained_lroundi32f64(double %d) local_unnamed_addr {
+; BE-LABEL: test_constrained_lroundi32f64:
+; BE: # %bb.0: # %entry
+; BE-NEXT: mflr r0
+; BE-NEXT: stdu r1, -112(r1)
+; BE-NEXT: std r0, 128(r1)
+; BE-NEXT: .cfi_def_cfa_offset 112
+; BE-NEXT: .cfi_offset lr, 16
+; BE-NEXT: bl lround
+; BE-NEXT: nop
+; BE-NEXT: addi r1, r1, 112
+; BE-NEXT: ld r0, 16(r1)
+; BE-NEXT: mtlr r0
+; BE-NEXT: blr
+;
+; CHECK-LABEL: test_constrained_lroundi32f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xsrdpi f0, f1
+; CHECK-NEXT: fctiw f0, f0
+; CHECK-NEXT: mffprwz r3, f0
+; CHECK-NEXT: blr
+entry:
+ %0 = tail call i32 @llvm.experimental.constrained.lround(double %d, metadata !"fpexcept.ignore")
+ ret i32 %0
+}
+
declare i32 @llvm.lround.i32.f64(double)
define dso_local i64 @test_lroundf(float %f) local_unnamed_addr {
@@ -284,18 +395,37 @@ define dso_local i64 @test_lroundf(float %f) local_unnamed_addr {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: test_lroundf:
-; FAST: # %bb.0: # %entry
-; FAST-NEXT: xsrdpi f0, f1
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: blr
entry:
%0 = tail call i64 @llvm.lround.i64.f32(float %f)
ret i64 %0
}
+define dso_local i64 @test_constrained_lroundf(float %f) local_unnamed_addr {
+; BE-LABEL: test_constrained_lroundf:
+; BE: # %bb.0: # %entry
+; BE-NEXT: mflr r0
+; BE-NEXT: stdu r1, -112(r1)
+; BE-NEXT: std r0, 128(r1)
+; BE-NEXT: .cfi_def_cfa_offset 112
+; BE-NEXT: .cfi_offset lr, 16
+; BE-NEXT: bl lroundf
+; BE-NEXT: nop
+; BE-NEXT: addi r1, r1, 112
+; BE-NEXT: ld r0, 16(r1)
+; BE-NEXT: mtlr r0
+; BE-NEXT: blr
+;
+; CHECK-LABEL: test_constrained_lroundf:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xsrdpi f0, f1
+; CHECK-NEXT: fctid f0, f0
+; CHECK-NEXT: mffprd r3, f0
+; CHECK-NEXT: blr
+entry:
+ %0 = tail call i64 @llvm.experimental.constrained.lround(float %f, metadata !"fpexcept.ignore")
+ ret i64 %0
+}
+
declare i64 @llvm.lround.i64.f32(float)
define dso_local i32 @test_lroundi32f32(float %d) local_unnamed_addr {
@@ -326,18 +456,37 @@ define dso_local i32 @test_lroundi32f32(float %d) local_unnamed_addr {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: test_lroundi32f32:
-; FAST: # %bb.0: # %entry
-; FAST-NEXT: xsrdpi f0, f1
-; FAST-NEXT: fctiw f0, f0
-; FAST-NEXT: mffprwz r3, f0
-; FAST-NEXT: blr
entry:
%0 = tail call i32 @llvm.lround.i32.f32(float %d)
ret i32 %0
}
+define dso_local i32 @test_constrained_lroundi32f32(float %f) local_unnamed_addr {
+; BE-LABEL: test_constrained_lroundi32f32:
+; BE: # %bb.0: # %entry
+; BE-NEXT: mflr r0
+; BE-NEXT: stdu r1, -112(r1)
+; BE-NEXT: std r0, 128(r1)
+; BE-NEXT: .cfi_def_cfa_offset 112
+; BE-NEXT: .cfi_offset lr, 16
+; BE-NEXT: bl lroundf
+; BE-NEXT: nop
+; BE-NEXT: addi r1, r1, 112
+; BE-NEXT: ld r0, 16(r1)
+; BE-NEXT: mtlr r0
+; BE-NEXT: blr
+;
+; CHECK-LABEL: test_constrained_lroundi32f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xsrdpi f0, f1
+; CHECK-NEXT: fctiw f0, f0
+; CHECK-NEXT: mffprwz r3, f0
+; CHECK-NEXT: blr
+entry:
+ %0 = tail call i32 @llvm.experimental.constrained.lround(float %f, metadata !"fpexcept.ignore")
+ ret i32 %0
+}
+
declare i32 @llvm.lround.i32.f32(float)
define dso_local i64 @test_llround(double %d) local_unnamed_addr {
@@ -368,18 +517,37 @@ define dso_local i64 @test_llround(double %d) local_unnamed_addr {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: test_llround:
-; FAST: # %bb.0: # %entry
-; FAST-NEXT: xsrdpi f0, f1
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: blr
entry:
%0 = tail call i64 @llvm.llround.i64.f64(double %d)
ret i64 %0
}
+define dso_local i64 @test_constrained_llround(double %d) local_unnamed_addr {
+; BE-LABEL: test_constrained_llround:
+; BE: # %bb.0: # %entry
+; BE-NEXT: mflr r0
+; BE-NEXT: stdu r1, -112(r1)
+; BE-NEXT: std r0, 128(r1)
+; BE-NEXT: .cfi_def_cfa_offset 112
+; BE-NEXT: .cfi_offset lr, 16
+; BE-NEXT: bl llround
+; BE-NEXT: nop
+; BE-NEXT: addi r1, r1, 112
+; BE-NEXT: ld r0, 16(r1)
+; BE-NEXT: mtlr r0
+; BE-NEXT: blr
+;
+; CHECK-LABEL: test_constrained_llround:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xsrdpi f0, f1
+; CHECK-NEXT: fctid f0, f0
+; CHECK-NEXT: mffprd r3, f0
+; CHECK-NEXT: blr
+entry:
+ %0 = tail call i64 @llvm.experimental.constrained.llround(double %d, metadata !"fpexcept.ignore")
+ ret i64 %0
+}
+
declare i64 @llvm.llround.i64.f64(double)
define dso_local i64 @test_llroundf(float %f) local_unnamed_addr {
@@ -410,18 +578,37 @@ define dso_local i64 @test_llroundf(float %f) local_unnamed_addr {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: test_llroundf:
-; FAST: # %bb.0: # %entry
-; FAST-NEXT: xsrdpi f0, f1
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: blr
entry:
%0 = tail call i64 @llvm.llround.i64.f32(float %f)
ret i64 %0
}
+define dso_local i64 @test_constrained_llroundf(float %f) local_unnamed_addr {
+; BE-LABEL: test_constrained_llroundf:
+; BE: # %bb.0: # %entry
+; BE-NEXT: mflr r0
+; BE-NEXT: stdu r1, -112(r1)
+; BE-NEXT: std r0, 128(r1)
+; BE-NEXT: .cfi_def_cfa_offset 112
+; BE-NEXT: .cfi_offset lr, 16
+; BE-NEXT: bl llroundf
+; BE-NEXT: nop
+; BE-NEXT: addi r1, r1, 112
+; BE-NEXT: ld r0, 16(r1)
+; BE-NEXT: mtlr r0
+; BE-NEXT: blr
+;
+; CHECK-LABEL: test_constrained_llroundf:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xsrdpi f0, f1
+; CHECK-NEXT: fctid f0, f0
+; CHECK-NEXT: mffprd r3, f0
+; CHECK-NEXT: blr
+entry:
+ %0 = tail call i64 @llvm.experimental.constrained.llround(float %f, metadata !"fpexcept.ignore")
+ ret i64 %0
+}
+
declare i64 @llvm.llround.i64.f32(float)
define dso_local double @test_nearbyint(double %d) local_unnamed_addr {
@@ -452,16 +639,26 @@ define dso_local double @test_nearbyint(double %d) local_unnamed_addr {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: test_nearbyint:
-; FAST: # %bb.0: # %entry
-; FAST-NEXT: xsrdpic f1, f1
-; FAST-NEXT: blr
entry:
%0 = tail call double @llvm.nearbyint.f64(double %d)
ret double %0
}
+define dso_local double @test_constrained_nearbyint(double %d) local_unnamed_addr {
+; BE-LABEL: test_constrained_nearbyint:
+; BE: # %bb.0: # %entry
+; BE-NEXT: xsrdpic f1, f1
+; BE-NEXT: blr
+;
+; CHECK-LABEL: test_constrained_nearbyint:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xsrdpic f1, f1
+; CHECK-NEXT: blr
+entry:
+ %0 = tail call double @llvm.experimental.constrained.nearbyint(double %d, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret double %0
+}
+
declare double @llvm.nearbyint.f64(double)
define dso_local float @test_nearbyintf(float %f) local_unnamed_addr {
@@ -492,16 +689,26 @@ define dso_local float @test_nearbyintf(float %f) local_unnamed_addr {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: test_nearbyintf:
-; FAST: # %bb.0: # %entry
-; FAST-NEXT: xsrdpic f1, f1
-; FAST-NEXT: blr
entry:
%0 = tail call float @llvm.nearbyint.f32(float %f)
ret float %0
}
+define dso_local float @test_constrained_nearbyintf(float %f) local_unnamed_addr {
+; BE-LABEL: test_constrained_nearbyintf:
+; BE: # %bb.0: # %entry
+; BE-NEXT: xsrdpic f1, f1
+; BE-NEXT: blr
+;
+; CHECK-LABEL: test_constrained_nearbyintf:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xsrdpic f1, f1
+; CHECK-NEXT: blr
+entry:
+ %0 = tail call float @llvm.experimental.constrained.nearbyint(float %f, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret float %0
+}
+
declare float @llvm.nearbyint.f32(float)
define dso_local double @test_round(double %d) local_unnamed_addr {
@@ -514,16 +721,26 @@ define dso_local double @test_round(double %d) local_unnamed_addr {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xsrdpi f1, f1
; CHECK-NEXT: blr
-;
-; FAST-LABEL: test_round:
-; FAST: # %bb.0: # %entry
-; FAST-NEXT: xsrdpi f1, f1
-; FAST-NEXT: blr
entry:
%0 = tail call double @llvm.round.f64(double %d)
ret double %0
}
+define dso_local double @test_constrained_round(double %d) local_unnamed_addr {
+; BE-LABEL: test_constrained_round:
+; BE: # %bb.0: # %entry
+; BE-NEXT: xsrdpi f1, f1
+; BE-NEXT: blr
+;
+; CHECK-LABEL: test_constrained_round:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xsrdpi f1, f1
+; CHECK-NEXT: blr
+entry:
+ %0 = tail call double @llvm.experimental.constrained.round(double %d, metadata !"fpexcept.ignore")
+ ret double %0
+}
+
declare double @llvm.round.f64(double)
define dso_local float @test_roundf(float %f) local_unnamed_addr {
@@ -536,16 +753,26 @@ define dso_local float @test_roundf(float %f) local_unnamed_addr {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xsrdpi f1, f1
; CHECK-NEXT: blr
-;
-; FAST-LABEL: test_roundf:
-; FAST: # %bb.0: # %entry
-; FAST-NEXT: xsrdpi f1, f1
-; FAST-NEXT: blr
entry:
%0 = tail call float @llvm.round.f32(float %f)
ret float %0
}
+define dso_local float @test_constrained_roundf(float %f) local_unnamed_addr {
+; BE-LABEL: test_constrained_roundf:
+; BE: # %bb.0: # %entry
+; BE-NEXT: xsrdpi f1, f1
+; BE-NEXT: blr
+;
+; CHECK-LABEL: test_constrained_roundf:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xsrdpi f1, f1
+; CHECK-NEXT: blr
+entry:
+ %0 = tail call float @llvm.experimental.constrained.round(float %f, metadata !"fpexcept.ignore")
+ ret float %0
+}
+
declare float @llvm.round.f32(float)
define dso_local double @test_trunc(double %d) local_unnamed_addr {
@@ -558,11 +785,6 @@ define dso_local double @test_trunc(double %d) local_unnamed_addr {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xsrdpiz f1, f1
; CHECK-NEXT: blr
-;
-; FAST-LABEL: test_trunc:
-; FAST: # %bb.0: # %entry
-; FAST-NEXT: xsrdpiz f1, f1
-; FAST-NEXT: blr
entry:
%0 = tail call double @llvm.trunc.f64(double %d)
ret double %0
@@ -580,11 +802,6 @@ define dso_local float @test_truncf(float %f) local_unnamed_addr {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xsrdpiz f1, f1
; CHECK-NEXT: blr
-;
-; FAST-LABEL: test_truncf:
-; FAST: # %bb.0: # %entry
-; FAST-NEXT: xsrdpiz f1, f1
-; FAST-NEXT: blr
entry:
%0 = tail call float @llvm.trunc.f32(float %f)
ret float %0
@@ -602,11 +819,6 @@ define dso_local double @test_floor(double %d) local_unnamed_addr {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xsrdpim f1, f1
; CHECK-NEXT: blr
-;
-; FAST-LABEL: test_floor:
-; FAST: # %bb.0: # %entry
-; FAST-NEXT: xsrdpim f1, f1
-; FAST-NEXT: blr
entry:
%0 = tail call double @llvm.floor.f64(double %d)
ret double %0
@@ -624,11 +836,6 @@ define dso_local float @test_floorf(float %f) local_unnamed_addr {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xsrdpim f1, f1
; CHECK-NEXT: blr
-;
-; FAST-LABEL: test_floorf:
-; FAST: # %bb.0: # %entry
-; FAST-NEXT: xsrdpim f1, f1
-; FAST-NEXT: blr
entry:
%0 = tail call float @llvm.floor.f32(float %f)
ret float %0
@@ -646,11 +853,6 @@ define dso_local double @test_ceil(double %d) local_unnamed_addr {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xsrdpip f1, f1
; CHECK-NEXT: blr
-;
-; FAST-LABEL: test_ceil:
-; FAST: # %bb.0: # %entry
-; FAST-NEXT: xsrdpip f1, f1
-; FAST-NEXT: blr
entry:
%0 = tail call double @llvm.ceil.f64(double %d)
ret double %0
@@ -668,11 +870,6 @@ define dso_local float @test_ceilf(float %f) local_unnamed_addr {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xsrdpip f1, f1
; CHECK-NEXT: blr
-;
-; FAST-LABEL: test_ceilf:
-; FAST: # %bb.0: # %entry
-; FAST-NEXT: xsrdpip f1, f1
-; FAST-NEXT: blr
entry:
%0 = tail call float @llvm.ceil.f32(float %f)
ret float %0
@@ -690,11 +887,6 @@ define dso_local double @test_rint(double %d) local_unnamed_addr {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xsrdpic f1, f1
; CHECK-NEXT: blr
-;
-; FAST-LABEL: test_rint:
-; FAST: # %bb.0: # %entry
-; FAST-NEXT: xsrdpic f1, f1
-; FAST-NEXT: blr
entry:
%0 = tail call double @llvm.rint.f64(double %d)
ret double %0
@@ -712,11 +904,6 @@ define dso_local float @test_rintf(float %f) local_unnamed_addr {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xsrdpic f1, f1
; CHECK-NEXT: blr
-;
-; FAST-LABEL: test_rintf:
-; FAST: # %bb.0: # %entry
-; FAST-NEXT: xsrdpic f1, f1
-; FAST-NEXT: blr
entry:
%0 = tail call float @llvm.rint.f32(float %f)
ret float %0
diff --git a/llvm/test/CodeGen/PowerPC/sign-ext-atomics.ll b/llvm/test/CodeGen/PowerPC/sign-ext-atomics.ll
index 128d546..da4c192 100644
--- a/llvm/test/CodeGen/PowerPC/sign-ext-atomics.ll
+++ b/llvm/test/CodeGen/PowerPC/sign-ext-atomics.ll
@@ -16,7 +16,7 @@ define i16 @SEXTParam(i16 signext %0) #0 {
; CHECK-NEXT: # %bb.2: # %top
; CHECK-NEXT: #
; CHECK-NEXT: sthcx. 3, 0, 4
-; CHECK-NEXT: bne 0, .LBB0_1
+; CHECK-NEXT: bne- 0, .LBB0_1
; CHECK-NEXT: .LBB0_3: # %top
; CHECK-NEXT: lwsync
; CHECK-NEXT: lhz 3, -4(1)
@@ -49,7 +49,7 @@ define i16 @noSEXTParam(i16 %0) #0 {
; CHECK-NEXT: # %bb.2: # %top
; CHECK-NEXT: #
; CHECK-NEXT: sthcx. 3, 0, 4
-; CHECK-NEXT: bne 0, .LBB1_1
+; CHECK-NEXT: bne- 0, .LBB1_1
; CHECK-NEXT: .LBB1_3: # %top
; CHECK-NEXT: lwsync
; CHECK-NEXT: lhz 3, -4(1)
@@ -82,7 +82,7 @@ define i16 @noSEXTLoad(ptr %p) #0 {
; CHECK-NEXT: # %bb.2: # %top
; CHECK-NEXT: #
; CHECK-NEXT: sthcx. 3, 0, 4
-; CHECK-NEXT: bne 0, .LBB2_1
+; CHECK-NEXT: bne- 0, .LBB2_1
; CHECK-NEXT: .LBB2_3: # %top
; CHECK-NEXT: lwsync
; CHECK-NEXT: lhz 3, -4(1)
diff --git a/llvm/test/CodeGen/PowerPC/vector-llrint.ll b/llvm/test/CodeGen/PowerPC/vector-llrint.ll
index 8a9e48e..d2fb6ca 100644
--- a/llvm/test/CodeGen/PowerPC/vector-llrint.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-llrint.ll
@@ -9,9 +9,6 @@
; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
; RUN: -mtriple=powerpc64le-unknown-unknown -verify-machineinstrs < %s | \
; RUN: FileCheck %s
-; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
-; RUN: -mtriple=powerpc64le-unknown-unknown -verify-machineinstrs < %s \
-; RUN: --enable-unsafe-fp-math | FileCheck %s --check-prefix=FAST
define <1 x i64> @llrint_v1i64_v1f16(<1 x half> %x) nounwind {
; BE-LABEL: llrint_v1i64_v1f16:
@@ -47,23 +44,6 @@ define <1 x i64> @llrint_v1i64_v1f16(<1 x half> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: llrint_v1i64_v1f16:
-; FAST: # %bb.0:
-; FAST-NEXT: mflr r0
-; FAST-NEXT: stdu r1, -32(r1)
-; FAST-NEXT: std r0, 48(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fctid f0, f1
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: addi r1, r1, 32
-; FAST-NEXT: ld r0, 16(r1)
-; FAST-NEXT: mtlr r0
-; FAST-NEXT: blr
%a = call <1 x i64> @llvm.llrint.v1i64.v1f16(<1 x half> %x)
ret <1 x i64> %a
}
@@ -147,41 +127,6 @@ define <2 x i64> @llrint_v1i64_v2f16(<2 x half> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: llrint_v1i64_v2f16:
-; FAST: # %bb.0:
-; FAST-NEXT: mflr r0
-; FAST-NEXT: stfd f30, -16(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f31, -8(r1) # 8-byte Folded Spill
-; FAST-NEXT: stdu r1, -48(r1)
-; FAST-NEXT: fmr f31, f1
-; FAST-NEXT: fmr f1, f2
-; FAST-NEXT: std r0, 64(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f30, f1
-; FAST-NEXT: fmr f1, f31
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fctid f0, f1
-; FAST-NEXT: fctid f1, f30
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: mffprd r3, f1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: xxmrghd v2, vs1, vs0
-; FAST-NEXT: addi r1, r1, 48
-; FAST-NEXT: ld r0, 16(r1)
-; FAST-NEXT: lfd f31, -8(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f30, -16(r1) # 8-byte Folded Reload
-; FAST-NEXT: mtlr r0
-; FAST-NEXT: blr
%a = call <2 x i64> @llvm.llrint.v2i64.v2f16(<2 x half> %x)
ret <2 x i64> %a
}
@@ -341,68 +286,6 @@ define <4 x i64> @llrint_v4i64_v4f16(<4 x half> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: llrint_v4i64_v4f16:
-; FAST: # %bb.0:
-; FAST-NEXT: mflr r0
-; FAST-NEXT: stfd f28, -32(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f29, -24(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f30, -16(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f31, -8(r1) # 8-byte Folded Spill
-; FAST-NEXT: stdu r1, -64(r1)
-; FAST-NEXT: fmr f29, f1
-; FAST-NEXT: fmr f1, f4
-; FAST-NEXT: std r0, 80(r1)
-; FAST-NEXT: fmr f31, f3
-; FAST-NEXT: fmr f30, f2
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f28, f1
-; FAST-NEXT: fmr f1, f31
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f31, f1
-; FAST-NEXT: fmr f1, f30
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f30, f1
-; FAST-NEXT: fmr f1, f29
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fctid f0, f30
-; FAST-NEXT: fctid f2, f31
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: fctid f1, f1
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: mffprd r3, f2
-; FAST-NEXT: mtfprd f2, r3
-; FAST-NEXT: mffprd r3, f1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: xxmrghd v2, vs0, vs1
-; FAST-NEXT: fctid f0, f28
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v3, vs0, vs2
-; FAST-NEXT: addi r1, r1, 64
-; FAST-NEXT: ld r0, 16(r1)
-; FAST-NEXT: lfd f31, -8(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f30, -16(r1) # 8-byte Folded Reload
-; FAST-NEXT: mtlr r0
-; FAST-NEXT: lfd f29, -24(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f28, -32(r1) # 8-byte Folded Reload
-; FAST-NEXT: blr
%a = call <4 x i64> @llvm.llrint.v4i64.v4f16(<4 x half> %x)
ret <4 x i64> %a
}
@@ -714,122 +597,6 @@ define <8 x i64> @llrint_v8i64_v8f16(<8 x half> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: llrint_v8i64_v8f16:
-; FAST: # %bb.0:
-; FAST-NEXT: mflr r0
-; FAST-NEXT: stfd f24, -64(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f25, -56(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f26, -48(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f27, -40(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f28, -32(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f29, -24(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f30, -16(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f31, -8(r1) # 8-byte Folded Spill
-; FAST-NEXT: stdu r1, -96(r1)
-; FAST-NEXT: fmr f24, f1
-; FAST-NEXT: fmr f1, f8
-; FAST-NEXT: std r0, 112(r1)
-; FAST-NEXT: fmr f30, f7
-; FAST-NEXT: fmr f29, f6
-; FAST-NEXT: fmr f28, f5
-; FAST-NEXT: fmr f27, f4
-; FAST-NEXT: fmr f26, f3
-; FAST-NEXT: fmr f25, f2
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f31, f1
-; FAST-NEXT: fmr f1, f30
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f30, f1
-; FAST-NEXT: fmr f1, f29
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f29, f1
-; FAST-NEXT: fmr f1, f28
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f28, f1
-; FAST-NEXT: fmr f1, f27
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f27, f1
-; FAST-NEXT: fmr f1, f26
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f26, f1
-; FAST-NEXT: fmr f1, f25
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f25, f1
-; FAST-NEXT: fmr f1, f24
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fctid f0, f25
-; FAST-NEXT: fctid f2, f26
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: fctid f3, f27
-; FAST-NEXT: fctid f4, f28
-; FAST-NEXT: fctid f5, f29
-; FAST-NEXT: fctid f6, f30
-; FAST-NEXT: fctid f1, f1
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: mffprd r3, f2
-; FAST-NEXT: mtfprd f2, r3
-; FAST-NEXT: mffprd r3, f3
-; FAST-NEXT: mtfprd f3, r3
-; FAST-NEXT: mffprd r3, f4
-; FAST-NEXT: mtfprd f4, r3
-; FAST-NEXT: mffprd r3, f5
-; FAST-NEXT: mtfprd f5, r3
-; FAST-NEXT: mffprd r3, f6
-; FAST-NEXT: mtfprd f6, r3
-; FAST-NEXT: mffprd r3, f1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: xxmrghd v3, vs3, vs2
-; FAST-NEXT: xxmrghd v4, vs5, vs4
-; FAST-NEXT: xxmrghd v2, vs0, vs1
-; FAST-NEXT: fctid f0, f31
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v5, vs0, vs6
-; FAST-NEXT: addi r1, r1, 96
-; FAST-NEXT: ld r0, 16(r1)
-; FAST-NEXT: lfd f31, -8(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f30, -16(r1) # 8-byte Folded Reload
-; FAST-NEXT: mtlr r0
-; FAST-NEXT: lfd f29, -24(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f28, -32(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f27, -40(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f26, -48(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f25, -56(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f24, -64(r1) # 8-byte Folded Reload
-; FAST-NEXT: blr
%a = call <8 x i64> @llvm.llrint.v8i64.v8f16(<8 x half> %x)
ret <8 x i64> %a
}
@@ -1439,228 +1206,6 @@ define <16 x i64> @llrint_v16i64_v16f16(<16 x half> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: llrint_v16i64_v16f16:
-; FAST: # %bb.0:
-; FAST-NEXT: mflr r0
-; FAST-NEXT: stfd f16, -128(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f17, -120(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f18, -112(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f19, -104(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f20, -96(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f21, -88(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f22, -80(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f23, -72(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f24, -64(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f25, -56(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f26, -48(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f27, -40(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f28, -32(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f29, -24(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f30, -16(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f31, -8(r1) # 8-byte Folded Spill
-; FAST-NEXT: stdu r1, -160(r1)
-; FAST-NEXT: fmr f26, f1
-; FAST-NEXT: lfs f1, 312(r1)
-; FAST-NEXT: std r0, 176(r1)
-; FAST-NEXT: fmr f28, f13
-; FAST-NEXT: fmr f27, f12
-; FAST-NEXT: fmr f24, f11
-; FAST-NEXT: fmr f21, f10
-; FAST-NEXT: fmr f19, f9
-; FAST-NEXT: fmr f18, f8
-; FAST-NEXT: fmr f17, f7
-; FAST-NEXT: fmr f16, f6
-; FAST-NEXT: fmr f20, f5
-; FAST-NEXT: fmr f22, f4
-; FAST-NEXT: fmr f23, f3
-; FAST-NEXT: fmr f25, f2
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f31, f1
-; FAST-NEXT: lfs f1, 304(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f30, f1
-; FAST-NEXT: lfs f1, 296(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f29, f1
-; FAST-NEXT: fmr f1, f28
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f28, f1
-; FAST-NEXT: fmr f1, f27
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f27, f1
-; FAST-NEXT: fmr f1, f24
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f24, f1
-; FAST-NEXT: fmr f1, f21
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f21, f1
-; FAST-NEXT: fmr f1, f19
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f19, f1
-; FAST-NEXT: fmr f1, f18
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f18, f1
-; FAST-NEXT: fmr f1, f17
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f17, f1
-; FAST-NEXT: fmr f1, f16
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f16, f1
-; FAST-NEXT: fmr f1, f20
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f20, f1
-; FAST-NEXT: fmr f1, f22
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f22, f1
-; FAST-NEXT: fmr f1, f23
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f23, f1
-; FAST-NEXT: fmr f1, f25
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f25, f1
-; FAST-NEXT: fmr f1, f26
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fctid f0, f25
-; FAST-NEXT: fctid f2, f23
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: fctid f3, f22
-; FAST-NEXT: fctid f4, f20
-; FAST-NEXT: fctid f5, f16
-; FAST-NEXT: fctid f6, f17
-; FAST-NEXT: fctid f7, f18
-; FAST-NEXT: fctid f8, f19
-; FAST-NEXT: fctid f9, f21
-; FAST-NEXT: fctid f10, f24
-; FAST-NEXT: fctid f1, f1
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: mffprd r3, f2
-; FAST-NEXT: mtfprd f2, r3
-; FAST-NEXT: mffprd r3, f3
-; FAST-NEXT: mtfprd f3, r3
-; FAST-NEXT: mffprd r3, f4
-; FAST-NEXT: mtfprd f4, r3
-; FAST-NEXT: mffprd r3, f5
-; FAST-NEXT: mtfprd f5, r3
-; FAST-NEXT: mffprd r3, f6
-; FAST-NEXT: mtfprd f6, r3
-; FAST-NEXT: mffprd r3, f7
-; FAST-NEXT: mtfprd f7, r3
-; FAST-NEXT: mffprd r3, f8
-; FAST-NEXT: mtfprd f8, r3
-; FAST-NEXT: mffprd r3, f9
-; FAST-NEXT: mtfprd f9, r3
-; FAST-NEXT: mffprd r3, f10
-; FAST-NEXT: mtfprd f10, r3
-; FAST-NEXT: mffprd r3, f1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: xxmrghd v3, vs3, vs2
-; FAST-NEXT: xxmrghd v4, vs5, vs4
-; FAST-NEXT: xxmrghd v5, vs7, vs6
-; FAST-NEXT: xxmrghd v6, vs9, vs8
-; FAST-NEXT: xxmrghd v2, vs0, vs1
-; FAST-NEXT: fctid f0, f27
-; FAST-NEXT: fctid f1, f29
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v7, vs0, vs10
-; FAST-NEXT: fctid f0, f28
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: mffprd r3, f1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: xxmrghd v8, vs1, vs0
-; FAST-NEXT: fctid f0, f30
-; FAST-NEXT: fctid f1, f31
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: mffprd r3, f1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: xxmrghd v9, vs1, vs0
-; FAST-NEXT: addi r1, r1, 160
-; FAST-NEXT: ld r0, 16(r1)
-; FAST-NEXT: lfd f31, -8(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f30, -16(r1) # 8-byte Folded Reload
-; FAST-NEXT: mtlr r0
-; FAST-NEXT: lfd f29, -24(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f28, -32(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f27, -40(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f26, -48(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f25, -56(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f24, -64(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f23, -72(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f22, -80(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f21, -88(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f20, -96(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f19, -104(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f18, -112(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f17, -120(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f16, -128(r1) # 8-byte Folded Reload
-; FAST-NEXT: blr
%a = call <16 x i64> @llvm.llrint.v16i64.v16f16(<16 x half> %x)
ret <16 x i64> %a
}
@@ -2839,523 +2384,6 @@ define <32 x i64> @llrint_v32i64_v32f16(<32 x half> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: llrint_v32i64_v32f16:
-; FAST: # %bb.0:
-; FAST-NEXT: mflr r0
-; FAST-NEXT: stdu r1, -480(r1)
-; FAST-NEXT: li r4, 128
-; FAST-NEXT: std r0, 496(r1)
-; FAST-NEXT: std r30, 320(r1) # 8-byte Folded Spill
-; FAST-NEXT: mr r30, r3
-; FAST-NEXT: stfd f14, 336(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f15, 344(r1) # 8-byte Folded Spill
-; FAST-NEXT: fmr f14, f5
-; FAST-NEXT: stfd f16, 352(r1) # 8-byte Folded Spill
-; FAST-NEXT: stxvd2x v20, r1, r4 # 16-byte Folded Spill
-; FAST-NEXT: li r4, 144
-; FAST-NEXT: fmr f16, f4
-; FAST-NEXT: stfd f17, 360(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f18, 368(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f19, 376(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f20, 384(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f21, 392(r1) # 8-byte Folded Spill
-; FAST-NEXT: stxvd2x v21, r1, r4 # 16-byte Folded Spill
-; FAST-NEXT: li r4, 160
-; FAST-NEXT: stfd f22, 400(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f23, 408(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f24, 416(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f25, 424(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f26, 432(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f27, 440(r1) # 8-byte Folded Spill
-; FAST-NEXT: stxvd2x v22, r1, r4 # 16-byte Folded Spill
-; FAST-NEXT: li r4, 176
-; FAST-NEXT: xxlor v22, f3, f3
-; FAST-NEXT: stfd f28, 448(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f29, 456(r1) # 8-byte Folded Spill
-; FAST-NEXT: fmr f29, f9
-; FAST-NEXT: stfd f30, 464(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f31, 472(r1) # 8-byte Folded Spill
-; FAST-NEXT: stxvd2x v23, r1, r4 # 16-byte Folded Spill
-; FAST-NEXT: li r4, 192
-; FAST-NEXT: xxlor v23, f2, f2
-; FAST-NEXT: stxvd2x v24, r1, r4 # 16-byte Folded Spill
-; FAST-NEXT: li r4, 208
-; FAST-NEXT: stxvd2x v25, r1, r4 # 16-byte Folded Spill
-; FAST-NEXT: li r4, 224
-; FAST-NEXT: xxlor v25, f13, f13
-; FAST-NEXT: stxvd2x v26, r1, r4 # 16-byte Folded Spill
-; FAST-NEXT: li r4, 240
-; FAST-NEXT: xxlor v26, f12, f12
-; FAST-NEXT: stxvd2x v27, r1, r4 # 16-byte Folded Spill
-; FAST-NEXT: li r4, 256
-; FAST-NEXT: xxlor v27, f11, f11
-; FAST-NEXT: stxvd2x v28, r1, r4 # 16-byte Folded Spill
-; FAST-NEXT: li r4, 272
-; FAST-NEXT: xxlor v28, f10, f10
-; FAST-NEXT: stxvd2x v29, r1, r4 # 16-byte Folded Spill
-; FAST-NEXT: li r4, 288
-; FAST-NEXT: xxlor v29, f8, f8
-; FAST-NEXT: stxvd2x v30, r1, r4 # 16-byte Folded Spill
-; FAST-NEXT: li r4, 304
-; FAST-NEXT: xxlor v30, f7, f7
-; FAST-NEXT: stxvd2x v31, r1, r4 # 16-byte Folded Spill
-; FAST-NEXT: li r4, 44
-; FAST-NEXT: xxlor v31, f6, f6
-; FAST-NEXT: stxsspx f1, r1, r4 # 4-byte Folded Spill
-; FAST-NEXT: lfs f1, 768(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: li r3, 120
-; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill
-; FAST-NEXT: lfs f1, 760(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: li r3, 112
-; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill
-; FAST-NEXT: lfs f1, 752(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: li r3, 104
-; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill
-; FAST-NEXT: lfs f1, 744(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: li r3, 96
-; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill
-; FAST-NEXT: lfs f1, 736(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: li r3, 88
-; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill
-; FAST-NEXT: lfs f1, 728(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: li r3, 80
-; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill
-; FAST-NEXT: lfs f1, 720(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: li r3, 72
-; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill
-; FAST-NEXT: lfs f1, 712(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: li r3, 64
-; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill
-; FAST-NEXT: lfs f1, 704(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: li r3, 56
-; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill
-; FAST-NEXT: lfs f1, 696(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: li r3, 48
-; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill
-; FAST-NEXT: lfs f1, 688(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: xxlor v21, f1, f1
-; FAST-NEXT: lfs f1, 680(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: xxlor v20, f1, f1
-; FAST-NEXT: lfs f1, 672(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: xxlor v24, f1, f1
-; FAST-NEXT: lfs f1, 664(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f31, f1
-; FAST-NEXT: lfs f1, 656(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f30, f1
-; FAST-NEXT: lfs f1, 648(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f28, f1
-; FAST-NEXT: lfs f1, 640(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f27, f1
-; FAST-NEXT: lfs f1, 632(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f26, f1
-; FAST-NEXT: lfs f1, 624(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f25, f1
-; FAST-NEXT: xxlor f1, v25, v25
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f24, f1
-; FAST-NEXT: xxlor f1, v26, v26
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f23, f1
-; FAST-NEXT: xxlor f1, v27, v27
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f22, f1
-; FAST-NEXT: xxlor f1, v28, v28
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f21, f1
-; FAST-NEXT: fmr f1, f29
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f20, f1
-; FAST-NEXT: xxlor f1, v29, v29
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f19, f1
-; FAST-NEXT: xxlor f1, v30, v30
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f18, f1
-; FAST-NEXT: xxlor f1, v31, v31
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f29, f1
-; FAST-NEXT: fmr f1, f14
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f14, f1
-; FAST-NEXT: fmr f1, f16
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f16, f1
-; FAST-NEXT: xxlor f1, v22, v22
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f17, f1
-; FAST-NEXT: xxlor f1, v23, v23
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: li r3, 44
-; FAST-NEXT: fmr f15, f1
-; FAST-NEXT: lxsspx f1, r1, r3 # 4-byte Folded Reload
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fctid f3, f15
-; FAST-NEXT: fctid f4, f17
-; FAST-NEXT: mffprd r3, f3
-; FAST-NEXT: fctid f5, f16
-; FAST-NEXT: fctid f6, f14
-; FAST-NEXT: fctid f7, f18
-; FAST-NEXT: fctid f8, f19
-; FAST-NEXT: fctid f13, f1
-; FAST-NEXT: fctid f9, f20
-; FAST-NEXT: fctid f10, f22
-; FAST-NEXT: fctid f11, f24
-; FAST-NEXT: fctid f12, f25
-; FAST-NEXT: fctid f2, f23
-; FAST-NEXT: fctid f0, f21
-; FAST-NEXT: mtvsrd v2, r3
-; FAST-NEXT: mffprd r3, f4
-; FAST-NEXT: mtvsrd v3, r3
-; FAST-NEXT: mffprd r3, f5
-; FAST-NEXT: mtfprd f5, r3
-; FAST-NEXT: mffprd r3, f6
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: mffprd r3, f7
-; FAST-NEXT: mtfprd f6, r3
-; FAST-NEXT: mffprd r3, f8
-; FAST-NEXT: mtfprd f7, r3
-; FAST-NEXT: mffprd r3, f9
-; FAST-NEXT: mtfprd f3, r3
-; FAST-NEXT: mffprd r3, f10
-; FAST-NEXT: mtfprd f4, r3
-; FAST-NEXT: mffprd r3, f11
-; FAST-NEXT: fctid f11, f31
-; FAST-NEXT: lfd f31, 56(r1) # 8-byte Folded Reload
-; FAST-NEXT: mtfprd f8, r3
-; FAST-NEXT: mffprd r3, f12
-; FAST-NEXT: xxlor f12, v24, v24
-; FAST-NEXT: fctid f31, f31
-; FAST-NEXT: fctid f12, f12
-; FAST-NEXT: mtfprd f9, r3
-; FAST-NEXT: mffprd r3, f13
-; FAST-NEXT: lfd f13, 48(r1) # 8-byte Folded Reload
-; FAST-NEXT: mtfprd f10, r3
-; FAST-NEXT: fctid f13, f13
-; FAST-NEXT: xxmrghd v3, vs5, v3
-; FAST-NEXT: fctid f5, f26
-; FAST-NEXT: mffprd r3, f5
-; FAST-NEXT: mtfprd f5, r3
-; FAST-NEXT: xxmrghd v4, vs7, vs6
-; FAST-NEXT: fctid f6, f27
-; FAST-NEXT: fctid f7, f28
-; FAST-NEXT: mffprd r3, f6
-; FAST-NEXT: lfd f28, 96(r1) # 8-byte Folded Reload
-; FAST-NEXT: fctid f28, f28
-; FAST-NEXT: mtfprd f6, r3
-; FAST-NEXT: mffprd r3, f7
-; FAST-NEXT: mtfprd f7, r3
-; FAST-NEXT: xxmrghd v2, v2, vs10
-; FAST-NEXT: fctid f10, f30
-; FAST-NEXT: mffprd r3, f10
-; FAST-NEXT: lfd f30, 80(r1) # 8-byte Folded Reload
-; FAST-NEXT: fctid f30, f30
-; FAST-NEXT: mtfprd f10, r3
-; FAST-NEXT: mffprd r3, f11
-; FAST-NEXT: mtfprd f11, r3
-; FAST-NEXT: mffprd r3, f12
-; FAST-NEXT: mtfprd f12, r3
-; FAST-NEXT: xxmrghd v5, vs12, vs11
-; FAST-NEXT: xxlor f11, v20, v20
-; FAST-NEXT: xxlor f12, v21, v21
-; FAST-NEXT: fctid f11, f11
-; FAST-NEXT: fctid f12, f12
-; FAST-NEXT: mffprd r3, f11
-; FAST-NEXT: mtfprd f11, r3
-; FAST-NEXT: mffprd r3, f12
-; FAST-NEXT: mtfprd f12, r3
-; FAST-NEXT: mffprd r3, f13
-; FAST-NEXT: mtfprd f13, r3
-; FAST-NEXT: mffprd r3, f31
-; FAST-NEXT: lfd f31, 64(r1) # 8-byte Folded Reload
-; FAST-NEXT: fctid f31, f31
-; FAST-NEXT: mtvsrd v0, r3
-; FAST-NEXT: mffprd r3, f31
-; FAST-NEXT: lfd f31, 72(r1) # 8-byte Folded Reload
-; FAST-NEXT: mtvsrd v1, r3
-; FAST-NEXT: mffprd r3, f30
-; FAST-NEXT: lfd f30, 88(r1) # 8-byte Folded Reload
-; FAST-NEXT: fctid f31, f31
-; FAST-NEXT: mtvsrd v6, r3
-; FAST-NEXT: mffprd r3, f28
-; FAST-NEXT: lfd f28, 104(r1) # 8-byte Folded Reload
-; FAST-NEXT: fctid f30, f30
-; FAST-NEXT: fctid f28, f28
-; FAST-NEXT: mtvsrd v7, r3
-; FAST-NEXT: mffprd r3, f28
-; FAST-NEXT: lfd f28, 112(r1) # 8-byte Folded Reload
-; FAST-NEXT: fctid f28, f28
-; FAST-NEXT: mtvsrd v8, r3
-; FAST-NEXT: mffprd r3, f28
-; FAST-NEXT: lfd f28, 120(r1) # 8-byte Folded Reload
-; FAST-NEXT: fctid f28, f28
-; FAST-NEXT: xxmrghd v10, vs12, vs11
-; FAST-NEXT: xxmrghd v0, v0, vs13
-; FAST-NEXT: xxswapd vs12, v0
-; FAST-NEXT: xxmrghd v0, vs9, vs8
-; FAST-NEXT: xxmrghd v7, v8, v7
-; FAST-NEXT: mtvsrd v8, r3
-; FAST-NEXT: mffprd r3, f28
-; FAST-NEXT: mtvsrd v9, r3
-; FAST-NEXT: mffprd r3, f30
-; FAST-NEXT: xxswapd v7, v7
-; FAST-NEXT: xxmrghd v8, v9, v8
-; FAST-NEXT: mtvsrd v9, r3
-; FAST-NEXT: mffprd r3, f31
-; FAST-NEXT: xxswapd v8, v8
-; FAST-NEXT: xxmrghd v6, v9, v6
-; FAST-NEXT: mtvsrd v9, r3
-; FAST-NEXT: li r3, 240
-; FAST-NEXT: stxvd2x v8, r30, r3
-; FAST-NEXT: li r3, 224
-; FAST-NEXT: stxvd2x v7, r30, r3
-; FAST-NEXT: li r3, 208
-; FAST-NEXT: xxswapd vs11, v6
-; FAST-NEXT: xxmrghd v6, vs10, vs7
-; FAST-NEXT: stxvd2x vs11, r30, r3
-; FAST-NEXT: li r3, 192
-; FAST-NEXT: xxmrghd v1, v9, v1
-; FAST-NEXT: xxswapd vs11, v1
-; FAST-NEXT: xxmrghd v1, vs6, vs5
-; FAST-NEXT: xxswapd vs5, v10
-; FAST-NEXT: xxswapd vs6, v5
-; FAST-NEXT: stxvd2x vs11, r30, r3
-; FAST-NEXT: li r3, 176
-; FAST-NEXT: stxvd2x vs12, r30, r3
-; FAST-NEXT: li r3, 160
-; FAST-NEXT: stxvd2x vs5, r30, r3
-; FAST-NEXT: li r3, 144
-; FAST-NEXT: stxvd2x vs6, r30, r3
-; FAST-NEXT: mffprd r3, f2
-; FAST-NEXT: mtfprd f7, r3
-; FAST-NEXT: li r3, 128
-; FAST-NEXT: xxswapd vs5, v6
-; FAST-NEXT: stxvd2x vs5, r30, r3
-; FAST-NEXT: li r3, 112
-; FAST-NEXT: xxswapd vs2, v1
-; FAST-NEXT: xxswapd vs6, v0
-; FAST-NEXT: stxvd2x vs2, r30, r3
-; FAST-NEXT: li r3, 96
-; FAST-NEXT: fctid f2, f29
-; FAST-NEXT: stxvd2x vs6, r30, r3
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: mffprd r3, f2
-; FAST-NEXT: mtfprd f2, r3
-; FAST-NEXT: li r3, 80
-; FAST-NEXT: xxmrghd v5, vs7, vs4
-; FAST-NEXT: xxswapd vs4, v2
-; FAST-NEXT: xxmrghd v0, vs0, vs3
-; FAST-NEXT: xxswapd vs0, v5
-; FAST-NEXT: xxswapd vs3, v3
-; FAST-NEXT: stxvd2x vs0, r30, r3
-; FAST-NEXT: li r3, 64
-; FAST-NEXT: xxswapd vs0, v0
-; FAST-NEXT: stxvd2x vs0, r30, r3
-; FAST-NEXT: li r3, 48
-; FAST-NEXT: xxmrghd v5, vs2, vs1
-; FAST-NEXT: xxswapd vs1, v4
-; FAST-NEXT: stxvd2x vs1, r30, r3
-; FAST-NEXT: li r3, 32
-; FAST-NEXT: xxswapd vs2, v5
-; FAST-NEXT: stxvd2x vs2, r30, r3
-; FAST-NEXT: li r3, 16
-; FAST-NEXT: stxvd2x vs3, r30, r3
-; FAST-NEXT: li r3, 304
-; FAST-NEXT: stxvd2x vs4, 0, r30
-; FAST-NEXT: lfd f31, 472(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f30, 464(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f29, 456(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f28, 448(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f27, 440(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f26, 432(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f25, 424(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f24, 416(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f23, 408(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f22, 400(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f21, 392(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f20, 384(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f19, 376(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f18, 368(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f17, 360(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f16, 352(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f15, 344(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f14, 336(r1) # 8-byte Folded Reload
-; FAST-NEXT: lxvd2x v31, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 288
-; FAST-NEXT: ld r30, 320(r1) # 8-byte Folded Reload
-; FAST-NEXT: lxvd2x v30, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 272
-; FAST-NEXT: lxvd2x v29, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 256
-; FAST-NEXT: lxvd2x v28, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 240
-; FAST-NEXT: lxvd2x v27, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 224
-; FAST-NEXT: lxvd2x v26, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 208
-; FAST-NEXT: lxvd2x v25, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 192
-; FAST-NEXT: lxvd2x v24, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 176
-; FAST-NEXT: lxvd2x v23, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 160
-; FAST-NEXT: lxvd2x v22, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 144
-; FAST-NEXT: lxvd2x v21, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 128
-; FAST-NEXT: lxvd2x v20, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: addi r1, r1, 480
-; FAST-NEXT: ld r0, 16(r1)
-; FAST-NEXT: mtlr r0
-; FAST-NEXT: blr
%a = call <32 x i64> @llvm.llrint.v32i64.v32f16(<32 x half> %x)
ret <32 x i64> %a
}
@@ -3385,12 +2413,6 @@ define <1 x i64> @llrint_v1i64_v1f32(<1 x float> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: llrint_v1i64_v1f32:
-; FAST: # %bb.0:
-; FAST-NEXT: fctid f0, f1
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: blr
%a = call <1 x i64> @llvm.llrint.v1i64.v1f32(<1 x float> %x)
ret <1 x i64> %a
}
@@ -3444,21 +2466,6 @@ define <2 x i64> @llrint_v2i64_v2f32(<2 x float> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: llrint_v2i64_v2f32:
-; FAST: # %bb.0:
-; FAST-NEXT: xxsldwi vs0, v2, v2, 3
-; FAST-NEXT: xxswapd vs1, v2
-; FAST-NEXT: xscvspdpn f0, vs0
-; FAST-NEXT: xscvspdpn f1, vs1
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: fctid f1, f1
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: mffprd r3, f1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: xxmrghd v2, vs1, vs0
-; FAST-NEXT: blr
%a = call <2 x i64> @llvm.llrint.v2i64.v2f32(<2 x float> %x)
ret <2 x i64> %a
}
@@ -3537,32 +2544,6 @@ define <4 x i64> @llrint_v4i64_v4f32(<4 x float> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: llrint_v4i64_v4f32:
-; FAST: # %bb.0:
-; FAST-NEXT: xxsldwi vs0, v2, v2, 3
-; FAST-NEXT: xxswapd vs1, v2
-; FAST-NEXT: xscvspdpn f0, vs0
-; FAST-NEXT: xxsldwi vs2, v2, v2, 1
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: xscvspdpn f0, vs1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v4, vs0, vs1
-; FAST-NEXT: xscvspdpn f0, v2
-; FAST-NEXT: vmr v2, v4
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: xscvspdpn f0, vs2
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v3, vs1, vs0
-; FAST-NEXT: blr
%a = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> %x)
ret <4 x i64> %a
}
@@ -3695,54 +2676,6 @@ define <8 x i64> @llrint_v8i64_v8f32(<8 x float> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: llrint_v8i64_v8f32:
-; FAST: # %bb.0:
-; FAST-NEXT: xxsldwi vs0, v2, v2, 3
-; FAST-NEXT: xxswapd vs1, v2
-; FAST-NEXT: xscvspdpn f0, vs0
-; FAST-NEXT: xxsldwi vs2, v2, v2, 1
-; FAST-NEXT: xxsldwi vs3, v3, v3, 3
-; FAST-NEXT: xxswapd vs4, v3
-; FAST-NEXT: xxsldwi vs5, v3, v3, 1
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: xscvspdpn f0, vs1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v0, vs0, vs1
-; FAST-NEXT: xscvspdpn f0, v2
-; FAST-NEXT: vmr v2, v0
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: xscvspdpn f0, vs2
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v1, vs1, vs0
-; FAST-NEXT: xscvspdpn f0, vs3
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: xscvspdpn f0, vs4
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v4, vs0, vs1
-; FAST-NEXT: xscvspdpn f0, v3
-; FAST-NEXT: vmr v3, v1
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: xscvspdpn f0, vs5
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v5, vs1, vs0
-; FAST-NEXT: blr
%a = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> %x)
ret <8 x i64> %a
}
@@ -3983,98 +2916,6 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: llrint_v16i64_v16f32:
-; FAST: # %bb.0:
-; FAST-NEXT: xxsldwi vs0, v2, v2, 3
-; FAST-NEXT: xxswapd vs1, v2
-; FAST-NEXT: xscvspdpn f0, vs0
-; FAST-NEXT: xxsldwi vs2, v2, v2, 1
-; FAST-NEXT: xxsldwi vs3, v3, v3, 3
-; FAST-NEXT: xxswapd vs4, v3
-; FAST-NEXT: xxsldwi vs5, v3, v3, 1
-; FAST-NEXT: xxsldwi vs6, v4, v4, 3
-; FAST-NEXT: xxswapd vs7, v4
-; FAST-NEXT: xxsldwi vs8, v4, v4, 1
-; FAST-NEXT: xxsldwi vs9, v5, v5, 3
-; FAST-NEXT: xxswapd vs10, v5
-; FAST-NEXT: xxsldwi vs11, v5, v5, 1
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: xscvspdpn f0, vs1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v0, vs0, vs1
-; FAST-NEXT: xscvspdpn f0, v2
-; FAST-NEXT: vmr v2, v0
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: xscvspdpn f0, vs2
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v1, vs1, vs0
-; FAST-NEXT: xscvspdpn f0, vs3
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: xscvspdpn f0, vs4
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v10, vs0, vs1
-; FAST-NEXT: xscvspdpn f0, v3
-; FAST-NEXT: vmr v3, v1
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: xscvspdpn f0, vs5
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v11, vs1, vs0
-; FAST-NEXT: xscvspdpn f0, vs6
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: xscvspdpn f0, vs7
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v6, vs0, vs1
-; FAST-NEXT: xscvspdpn f0, v4
-; FAST-NEXT: xscvspdpn f1, vs8
-; FAST-NEXT: vmr v4, v10
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: fctid f1, f1
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: mffprd r3, f1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: xxmrghd v7, vs0, vs1
-; FAST-NEXT: xscvspdpn f0, vs9
-; FAST-NEXT: xscvspdpn f1, vs10
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: fctid f1, f1
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: mffprd r3, f1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: xxmrghd v8, vs1, vs0
-; FAST-NEXT: xscvspdpn f0, v5
-; FAST-NEXT: xscvspdpn f1, vs11
-; FAST-NEXT: vmr v5, v11
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: fctid f1, f1
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: mffprd r3, f1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: xxmrghd v9, vs0, vs1
-; FAST-NEXT: blr
%a = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> %x)
ret <16 x i64> %a
}
@@ -4104,12 +2945,6 @@ define <1 x i64> @llrint_v1i64_v1f64(<1 x double> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: llrint_v1i64_v1f64:
-; FAST: # %bb.0:
-; FAST-NEXT: fctid f0, f1
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: blr
%a = call <1 x i64> @llvm.llrint.v1i64.v1f64(<1 x double> %x)
ret <1 x i64> %a
}
@@ -4164,19 +2999,6 @@ define <2 x i64> @llrint_v2i64_v2f64(<2 x double> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: llrint_v2i64_v2f64:
-; FAST: # %bb.0:
-; FAST-NEXT: xxlor f1, v2, v2
-; FAST-NEXT: xxswapd vs0, v2
-; FAST-NEXT: fctid f1, f1
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v2, vs1, vs0
-; FAST-NEXT: blr
%a = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> %x)
ret <2 x i64> %a
}
@@ -4261,28 +3083,6 @@ define <4 x i64> @llrint_v4i64_v4f64(<4 x double> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: llrint_v4i64_v4f64:
-; FAST: # %bb.0:
-; FAST-NEXT: xxswapd vs0, v2
-; FAST-NEXT: xxlor f2, v2, v2
-; FAST-NEXT: xxswapd vs1, v3
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: fctid f2, f2
-; FAST-NEXT: fctid f1, f1
-; FAST-NEXT: mffprd r4, f0
-; FAST-NEXT: xxlor f0, v3, v3
-; FAST-NEXT: mffprd r3, f2
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mtfprd f2, r4
-; FAST-NEXT: mffprd r5, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: mffprd r3, f1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: xxmrghd v2, vs0, vs2
-; FAST-NEXT: mtfprd f0, r5
-; FAST-NEXT: xxmrghd v3, vs0, vs1
-; FAST-NEXT: blr
%a = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> %x)
ret <4 x i64> %a
}
@@ -4427,46 +3227,6 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: llrint_v8i64_v8f64:
-; FAST: # %bb.0:
-; FAST-NEXT: xxswapd vs0, v2
-; FAST-NEXT: xxswapd vs1, v3
-; FAST-NEXT: xxlor f4, v2, v2
-; FAST-NEXT: xxswapd vs2, v4
-; FAST-NEXT: xxswapd vs3, v5
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: fctid f4, f4
-; FAST-NEXT: mffprd r4, f0
-; FAST-NEXT: xxlor f0, v3, v3
-; FAST-NEXT: mffprd r3, f4
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r5, f0
-; FAST-NEXT: fctid f0, f1
-; FAST-NEXT: mtfprd f1, r4
-; FAST-NEXT: mffprd r6, f0
-; FAST-NEXT: xxlor f0, v4, v4
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mtfprd f4, r6
-; FAST-NEXT: mffprd r7, f0
-; FAST-NEXT: fctid f0, f2
-; FAST-NEXT: mtfprd f2, r5
-; FAST-NEXT: mtfprd f5, r7
-; FAST-NEXT: mffprd r8, f0
-; FAST-NEXT: xxlor f0, v5, v5
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mtfprd f6, r8
-; FAST-NEXT: mffprd r9, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v3, vs2, vs4
-; FAST-NEXT: xxmrghd v4, vs5, vs6
-; FAST-NEXT: xxmrghd v2, vs0, vs1
-; FAST-NEXT: fctid f1, f3
-; FAST-NEXT: mtfprd f0, r9
-; FAST-NEXT: mffprd r3, f1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: xxmrghd v5, vs0, vs1
-; FAST-NEXT: blr
%a = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> %x)
ret <8 x i64> %a
}
@@ -4496,18 +3256,6 @@ define <1 x i64> @llrint_v1i64_v1f128(<1 x fp128> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: llrint_v1i64_v1f128:
-; FAST: # %bb.0:
-; FAST-NEXT: mflr r0
-; FAST-NEXT: stdu r1, -32(r1)
-; FAST-NEXT: std r0, 48(r1)
-; FAST-NEXT: bl llrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: addi r1, r1, 32
-; FAST-NEXT: ld r0, 16(r1)
-; FAST-NEXT: mtlr r0
-; FAST-NEXT: blr
%a = call <1 x i64> @llvm.llrint.v1i64.v1f128(<1 x fp128> %x)
ret <1 x i64> %a
}
@@ -4565,33 +3313,6 @@ define <2 x i64> @llrint_v2i64_v2f128(<2 x fp128> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: llrint_v2i64_v2f128:
-; FAST: # %bb.0:
-; FAST-NEXT: mflr r0
-; FAST-NEXT: stdu r1, -80(r1)
-; FAST-NEXT: li r3, 48
-; FAST-NEXT: std r0, 96(r1)
-; FAST-NEXT: stxvd2x v30, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 64
-; FAST-NEXT: stxvd2x v31, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: vmr v31, v3
-; FAST-NEXT: bl llrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: vmr v2, v31
-; FAST-NEXT: mtvsrd v30, r3
-; FAST-NEXT: bl llrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: li r3, 64
-; FAST-NEXT: lxvd2x v31, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 48
-; FAST-NEXT: xxmrghd v2, vs0, v30
-; FAST-NEXT: lxvd2x v30, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: addi r1, r1, 80
-; FAST-NEXT: ld r0, 16(r1)
-; FAST-NEXT: mtlr r0
-; FAST-NEXT: blr
%a = call <2 x i64> @llvm.llrint.v2i64.v2f128(<2 x fp128> %x)
ret <2 x i64> %a
}
@@ -4689,53 +3410,6 @@ define <4 x i64> @llrint_v4i64_v4f128(<4 x fp128> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: llrint_v4i64_v4f128:
-; FAST: # %bb.0:
-; FAST-NEXT: mflr r0
-; FAST-NEXT: stdu r1, -112(r1)
-; FAST-NEXT: li r3, 48
-; FAST-NEXT: std r0, 128(r1)
-; FAST-NEXT: stxvd2x v28, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 64
-; FAST-NEXT: stxvd2x v29, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 80
-; FAST-NEXT: vmr v29, v3
-; FAST-NEXT: stxvd2x v30, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 96
-; FAST-NEXT: vmr v30, v4
-; FAST-NEXT: stxvd2x v31, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: vmr v31, v5
-; FAST-NEXT: bl llrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: vmr v2, v29
-; FAST-NEXT: mtvsrd v28, r3
-; FAST-NEXT: bl llrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: vmr v2, v30
-; FAST-NEXT: xxmrghd v29, vs0, v28
-; FAST-NEXT: bl llrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: vmr v2, v31
-; FAST-NEXT: mtvsrd v30, r3
-; FAST-NEXT: bl llrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: li r3, 96
-; FAST-NEXT: vmr v2, v29
-; FAST-NEXT: lxvd2x v31, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 80
-; FAST-NEXT: xxmrghd v3, vs0, v30
-; FAST-NEXT: lxvd2x v30, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 64
-; FAST-NEXT: lxvd2x v29, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 48
-; FAST-NEXT: lxvd2x v28, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: addi r1, r1, 112
-; FAST-NEXT: ld r0, 16(r1)
-; FAST-NEXT: mtlr r0
-; FAST-NEXT: blr
%a = call <4 x i64> @llvm.llrint.v4i64.v4f128(<4 x fp128> %x)
ret <4 x i64> %a
}
@@ -4913,93 +3587,6 @@ define <8 x i64> @llrint_v8i64_v8f128(<8 x fp128> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: llrint_v8i64_v8f128:
-; FAST: # %bb.0:
-; FAST-NEXT: mflr r0
-; FAST-NEXT: stdu r1, -176(r1)
-; FAST-NEXT: li r3, 48
-; FAST-NEXT: std r0, 192(r1)
-; FAST-NEXT: stxvd2x v24, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 64
-; FAST-NEXT: stxvd2x v25, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 80
-; FAST-NEXT: vmr v25, v3
-; FAST-NEXT: stxvd2x v26, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 96
-; FAST-NEXT: vmr v26, v4
-; FAST-NEXT: stxvd2x v27, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 112
-; FAST-NEXT: vmr v27, v5
-; FAST-NEXT: stxvd2x v28, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 128
-; FAST-NEXT: vmr v28, v6
-; FAST-NEXT: stxvd2x v29, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 144
-; FAST-NEXT: vmr v29, v7
-; FAST-NEXT: stxvd2x v30, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 160
-; FAST-NEXT: vmr v30, v8
-; FAST-NEXT: stxvd2x v31, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: vmr v31, v9
-; FAST-NEXT: bl llrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: vmr v2, v25
-; FAST-NEXT: mtvsrd v24, r3
-; FAST-NEXT: bl llrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: vmr v2, v26
-; FAST-NEXT: xxmrghd v25, vs0, v24
-; FAST-NEXT: bl llrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: vmr v2, v27
-; FAST-NEXT: mtvsrd v26, r3
-; FAST-NEXT: bl llrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: vmr v2, v28
-; FAST-NEXT: xxmrghd v27, vs0, v26
-; FAST-NEXT: bl llrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: vmr v2, v29
-; FAST-NEXT: mtvsrd v28, r3
-; FAST-NEXT: bl llrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: vmr v2, v30
-; FAST-NEXT: xxmrghd v29, vs0, v28
-; FAST-NEXT: bl llrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: vmr v2, v31
-; FAST-NEXT: mtvsrd v30, r3
-; FAST-NEXT: bl llrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: li r3, 160
-; FAST-NEXT: vmr v4, v29
-; FAST-NEXT: lxvd2x v31, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 144
-; FAST-NEXT: vmr v3, v27
-; FAST-NEXT: vmr v2, v25
-; FAST-NEXT: xxmrghd v5, vs0, v30
-; FAST-NEXT: lxvd2x v30, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 128
-; FAST-NEXT: lxvd2x v29, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 112
-; FAST-NEXT: lxvd2x v28, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 96
-; FAST-NEXT: lxvd2x v27, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 80
-; FAST-NEXT: lxvd2x v26, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 64
-; FAST-NEXT: lxvd2x v25, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 48
-; FAST-NEXT: lxvd2x v24, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: addi r1, r1, 176
-; FAST-NEXT: ld r0, 16(r1)
-; FAST-NEXT: mtlr r0
-; FAST-NEXT: blr
%a = call <8 x i64> @llvm.llrint.v8i64.v8f128(<8 x fp128> %x)
ret <8 x i64> %a
}
diff --git a/llvm/test/CodeGen/PowerPC/vector-lrint.ll b/llvm/test/CodeGen/PowerPC/vector-lrint.ll
index f437536..af5704b 100644
--- a/llvm/test/CodeGen/PowerPC/vector-lrint.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-lrint.ll
@@ -9,10 +9,6 @@
; RUN: sed 's/iXLen/i32/g' %s | llc -mcpu=pwr8 -ppc-asm-full-reg-names \
; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-unknown \
; RUN: -verify-machineinstrs | FileCheck %s
-; RUN: sed 's/iXLen/i32/g' %s | llc -mcpu=pwr8 -ppc-asm-full-reg-names \
-; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-unknown \
-; RUN: -verify-machineinstrs --enable-unsafe-fp-math | \
-; RUN: FileCheck %s --check-prefixes=FAST
; FIXME: crash "Input type needs to be promoted!"
; SKIP: sed 's/iXLen/i64/g' %s | llc -ppc-asm-full-reg-names \
; SKIP: -ppc-vsr-nums-as-vr -mtriple=powerpc-unknown-unknown \
@@ -23,10 +19,6 @@
; RUN: sed 's/iXLen/i64/g' %s | llc -mcpu=pwr8 -ppc-asm-full-reg-names \
; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-unknown \
; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
-; RUN: sed 's/iXLen/i64/g' %s | llc -mcpu=pwr8 -ppc-asm-full-reg-names \
-; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-unknown \
-; RUN: -verify-machineinstrs --enable-unsafe-fp-math | \
-; RUN: FileCheck %s --check-prefixes=FAST
define <1 x i64> @lrint_v1f16(<1 x half> %x) nounwind {
; BE-LABEL: lrint_v1f16:
@@ -62,23 +54,6 @@ define <1 x i64> @lrint_v1f16(<1 x half> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: lrint_v1f16:
-; FAST: # %bb.0:
-; FAST-NEXT: mflr r0
-; FAST-NEXT: stdu r1, -32(r1)
-; FAST-NEXT: std r0, 48(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fctid f0, f1
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: addi r1, r1, 32
-; FAST-NEXT: ld r0, 16(r1)
-; FAST-NEXT: mtlr r0
-; FAST-NEXT: blr
%a = call <1 x i64> @llvm.lrint.v1i64.v1f16(<1 x half> %x)
ret <1 x i64> %a
}
@@ -162,41 +137,6 @@ define <2 x i64> @lrint_v2f16(<2 x half> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: lrint_v2f16:
-; FAST: # %bb.0:
-; FAST-NEXT: mflr r0
-; FAST-NEXT: stfd f30, -16(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f31, -8(r1) # 8-byte Folded Spill
-; FAST-NEXT: stdu r1, -48(r1)
-; FAST-NEXT: fmr f31, f1
-; FAST-NEXT: fmr f1, f2
-; FAST-NEXT: std r0, 64(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f30, f1
-; FAST-NEXT: fmr f1, f31
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fctid f0, f1
-; FAST-NEXT: fctid f1, f30
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: mffprd r3, f1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: xxmrghd v2, vs1, vs0
-; FAST-NEXT: addi r1, r1, 48
-; FAST-NEXT: ld r0, 16(r1)
-; FAST-NEXT: lfd f31, -8(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f30, -16(r1) # 8-byte Folded Reload
-; FAST-NEXT: mtlr r0
-; FAST-NEXT: blr
%a = call <2 x i64> @llvm.lrint.v2i64.v2f16(<2 x half> %x)
ret <2 x i64> %a
}
@@ -356,68 +296,6 @@ define <4 x i64> @lrint_v4f16(<4 x half> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: lrint_v4f16:
-; FAST: # %bb.0:
-; FAST-NEXT: mflr r0
-; FAST-NEXT: stfd f28, -32(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f29, -24(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f30, -16(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f31, -8(r1) # 8-byte Folded Spill
-; FAST-NEXT: stdu r1, -64(r1)
-; FAST-NEXT: fmr f29, f1
-; FAST-NEXT: fmr f1, f4
-; FAST-NEXT: std r0, 80(r1)
-; FAST-NEXT: fmr f31, f3
-; FAST-NEXT: fmr f30, f2
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f28, f1
-; FAST-NEXT: fmr f1, f31
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f31, f1
-; FAST-NEXT: fmr f1, f30
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f30, f1
-; FAST-NEXT: fmr f1, f29
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fctid f0, f30
-; FAST-NEXT: fctid f2, f31
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: fctid f1, f1
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: mffprd r3, f2
-; FAST-NEXT: mtfprd f2, r3
-; FAST-NEXT: mffprd r3, f1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: xxmrghd v2, vs0, vs1
-; FAST-NEXT: fctid f0, f28
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v3, vs0, vs2
-; FAST-NEXT: addi r1, r1, 64
-; FAST-NEXT: ld r0, 16(r1)
-; FAST-NEXT: lfd f31, -8(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f30, -16(r1) # 8-byte Folded Reload
-; FAST-NEXT: mtlr r0
-; FAST-NEXT: lfd f29, -24(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f28, -32(r1) # 8-byte Folded Reload
-; FAST-NEXT: blr
%a = call <4 x i64> @llvm.lrint.v4i64.v4f16(<4 x half> %x)
ret <4 x i64> %a
}
@@ -729,122 +607,6 @@ define <8 x i64> @lrint_v8f16(<8 x half> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: lrint_v8f16:
-; FAST: # %bb.0:
-; FAST-NEXT: mflr r0
-; FAST-NEXT: stfd f24, -64(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f25, -56(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f26, -48(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f27, -40(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f28, -32(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f29, -24(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f30, -16(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f31, -8(r1) # 8-byte Folded Spill
-; FAST-NEXT: stdu r1, -96(r1)
-; FAST-NEXT: fmr f24, f1
-; FAST-NEXT: fmr f1, f8
-; FAST-NEXT: std r0, 112(r1)
-; FAST-NEXT: fmr f30, f7
-; FAST-NEXT: fmr f29, f6
-; FAST-NEXT: fmr f28, f5
-; FAST-NEXT: fmr f27, f4
-; FAST-NEXT: fmr f26, f3
-; FAST-NEXT: fmr f25, f2
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f31, f1
-; FAST-NEXT: fmr f1, f30
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f30, f1
-; FAST-NEXT: fmr f1, f29
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f29, f1
-; FAST-NEXT: fmr f1, f28
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f28, f1
-; FAST-NEXT: fmr f1, f27
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f27, f1
-; FAST-NEXT: fmr f1, f26
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f26, f1
-; FAST-NEXT: fmr f1, f25
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f25, f1
-; FAST-NEXT: fmr f1, f24
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fctid f0, f25
-; FAST-NEXT: fctid f2, f26
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: fctid f3, f27
-; FAST-NEXT: fctid f4, f28
-; FAST-NEXT: fctid f5, f29
-; FAST-NEXT: fctid f6, f30
-; FAST-NEXT: fctid f1, f1
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: mffprd r3, f2
-; FAST-NEXT: mtfprd f2, r3
-; FAST-NEXT: mffprd r3, f3
-; FAST-NEXT: mtfprd f3, r3
-; FAST-NEXT: mffprd r3, f4
-; FAST-NEXT: mtfprd f4, r3
-; FAST-NEXT: mffprd r3, f5
-; FAST-NEXT: mtfprd f5, r3
-; FAST-NEXT: mffprd r3, f6
-; FAST-NEXT: mtfprd f6, r3
-; FAST-NEXT: mffprd r3, f1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: xxmrghd v3, vs3, vs2
-; FAST-NEXT: xxmrghd v4, vs5, vs4
-; FAST-NEXT: xxmrghd v2, vs0, vs1
-; FAST-NEXT: fctid f0, f31
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v5, vs0, vs6
-; FAST-NEXT: addi r1, r1, 96
-; FAST-NEXT: ld r0, 16(r1)
-; FAST-NEXT: lfd f31, -8(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f30, -16(r1) # 8-byte Folded Reload
-; FAST-NEXT: mtlr r0
-; FAST-NEXT: lfd f29, -24(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f28, -32(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f27, -40(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f26, -48(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f25, -56(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f24, -64(r1) # 8-byte Folded Reload
-; FAST-NEXT: blr
%a = call <8 x i64> @llvm.lrint.v8i64.v8f16(<8 x half> %x)
ret <8 x i64> %a
}
@@ -1454,228 +1216,6 @@ define <16 x i64> @lrint_v16i64_v16f16(<16 x half> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: lrint_v16i64_v16f16:
-; FAST: # %bb.0:
-; FAST-NEXT: mflr r0
-; FAST-NEXT: stfd f16, -128(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f17, -120(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f18, -112(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f19, -104(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f20, -96(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f21, -88(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f22, -80(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f23, -72(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f24, -64(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f25, -56(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f26, -48(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f27, -40(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f28, -32(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f29, -24(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f30, -16(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f31, -8(r1) # 8-byte Folded Spill
-; FAST-NEXT: stdu r1, -160(r1)
-; FAST-NEXT: fmr f26, f1
-; FAST-NEXT: lfs f1, 312(r1)
-; FAST-NEXT: std r0, 176(r1)
-; FAST-NEXT: fmr f28, f13
-; FAST-NEXT: fmr f27, f12
-; FAST-NEXT: fmr f24, f11
-; FAST-NEXT: fmr f21, f10
-; FAST-NEXT: fmr f19, f9
-; FAST-NEXT: fmr f18, f8
-; FAST-NEXT: fmr f17, f7
-; FAST-NEXT: fmr f16, f6
-; FAST-NEXT: fmr f20, f5
-; FAST-NEXT: fmr f22, f4
-; FAST-NEXT: fmr f23, f3
-; FAST-NEXT: fmr f25, f2
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f31, f1
-; FAST-NEXT: lfs f1, 304(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f30, f1
-; FAST-NEXT: lfs f1, 296(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f29, f1
-; FAST-NEXT: fmr f1, f28
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f28, f1
-; FAST-NEXT: fmr f1, f27
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f27, f1
-; FAST-NEXT: fmr f1, f24
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f24, f1
-; FAST-NEXT: fmr f1, f21
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f21, f1
-; FAST-NEXT: fmr f1, f19
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f19, f1
-; FAST-NEXT: fmr f1, f18
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f18, f1
-; FAST-NEXT: fmr f1, f17
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f17, f1
-; FAST-NEXT: fmr f1, f16
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f16, f1
-; FAST-NEXT: fmr f1, f20
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f20, f1
-; FAST-NEXT: fmr f1, f22
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f22, f1
-; FAST-NEXT: fmr f1, f23
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f23, f1
-; FAST-NEXT: fmr f1, f25
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f25, f1
-; FAST-NEXT: fmr f1, f26
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fctid f0, f25
-; FAST-NEXT: fctid f2, f23
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: fctid f3, f22
-; FAST-NEXT: fctid f4, f20
-; FAST-NEXT: fctid f5, f16
-; FAST-NEXT: fctid f6, f17
-; FAST-NEXT: fctid f7, f18
-; FAST-NEXT: fctid f8, f19
-; FAST-NEXT: fctid f9, f21
-; FAST-NEXT: fctid f10, f24
-; FAST-NEXT: fctid f1, f1
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: mffprd r3, f2
-; FAST-NEXT: mtfprd f2, r3
-; FAST-NEXT: mffprd r3, f3
-; FAST-NEXT: mtfprd f3, r3
-; FAST-NEXT: mffprd r3, f4
-; FAST-NEXT: mtfprd f4, r3
-; FAST-NEXT: mffprd r3, f5
-; FAST-NEXT: mtfprd f5, r3
-; FAST-NEXT: mffprd r3, f6
-; FAST-NEXT: mtfprd f6, r3
-; FAST-NEXT: mffprd r3, f7
-; FAST-NEXT: mtfprd f7, r3
-; FAST-NEXT: mffprd r3, f8
-; FAST-NEXT: mtfprd f8, r3
-; FAST-NEXT: mffprd r3, f9
-; FAST-NEXT: mtfprd f9, r3
-; FAST-NEXT: mffprd r3, f10
-; FAST-NEXT: mtfprd f10, r3
-; FAST-NEXT: mffprd r3, f1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: xxmrghd v3, vs3, vs2
-; FAST-NEXT: xxmrghd v4, vs5, vs4
-; FAST-NEXT: xxmrghd v5, vs7, vs6
-; FAST-NEXT: xxmrghd v6, vs9, vs8
-; FAST-NEXT: xxmrghd v2, vs0, vs1
-; FAST-NEXT: fctid f0, f27
-; FAST-NEXT: fctid f1, f29
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v7, vs0, vs10
-; FAST-NEXT: fctid f0, f28
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: mffprd r3, f1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: xxmrghd v8, vs1, vs0
-; FAST-NEXT: fctid f0, f30
-; FAST-NEXT: fctid f1, f31
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: mffprd r3, f1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: xxmrghd v9, vs1, vs0
-; FAST-NEXT: addi r1, r1, 160
-; FAST-NEXT: ld r0, 16(r1)
-; FAST-NEXT: lfd f31, -8(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f30, -16(r1) # 8-byte Folded Reload
-; FAST-NEXT: mtlr r0
-; FAST-NEXT: lfd f29, -24(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f28, -32(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f27, -40(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f26, -48(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f25, -56(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f24, -64(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f23, -72(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f22, -80(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f21, -88(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f20, -96(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f19, -104(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f18, -112(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f17, -120(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f16, -128(r1) # 8-byte Folded Reload
-; FAST-NEXT: blr
%a = call <16 x i64> @llvm.lrint.v16i64.v16f16(<16 x half> %x)
ret <16 x i64> %a
}
@@ -2854,523 +2394,6 @@ define <32 x i64> @lrint_v32i64_v32f16(<32 x half> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: lrint_v32i64_v32f16:
-; FAST: # %bb.0:
-; FAST-NEXT: mflr r0
-; FAST-NEXT: stdu r1, -480(r1)
-; FAST-NEXT: li r4, 128
-; FAST-NEXT: std r0, 496(r1)
-; FAST-NEXT: std r30, 320(r1) # 8-byte Folded Spill
-; FAST-NEXT: mr r30, r3
-; FAST-NEXT: stfd f14, 336(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f15, 344(r1) # 8-byte Folded Spill
-; FAST-NEXT: fmr f14, f5
-; FAST-NEXT: stfd f16, 352(r1) # 8-byte Folded Spill
-; FAST-NEXT: stxvd2x v20, r1, r4 # 16-byte Folded Spill
-; FAST-NEXT: li r4, 144
-; FAST-NEXT: fmr f16, f4
-; FAST-NEXT: stfd f17, 360(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f18, 368(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f19, 376(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f20, 384(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f21, 392(r1) # 8-byte Folded Spill
-; FAST-NEXT: stxvd2x v21, r1, r4 # 16-byte Folded Spill
-; FAST-NEXT: li r4, 160
-; FAST-NEXT: stfd f22, 400(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f23, 408(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f24, 416(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f25, 424(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f26, 432(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f27, 440(r1) # 8-byte Folded Spill
-; FAST-NEXT: stxvd2x v22, r1, r4 # 16-byte Folded Spill
-; FAST-NEXT: li r4, 176
-; FAST-NEXT: xxlor v22, f3, f3
-; FAST-NEXT: stfd f28, 448(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f29, 456(r1) # 8-byte Folded Spill
-; FAST-NEXT: fmr f29, f9
-; FAST-NEXT: stfd f30, 464(r1) # 8-byte Folded Spill
-; FAST-NEXT: stfd f31, 472(r1) # 8-byte Folded Spill
-; FAST-NEXT: stxvd2x v23, r1, r4 # 16-byte Folded Spill
-; FAST-NEXT: li r4, 192
-; FAST-NEXT: xxlor v23, f2, f2
-; FAST-NEXT: stxvd2x v24, r1, r4 # 16-byte Folded Spill
-; FAST-NEXT: li r4, 208
-; FAST-NEXT: stxvd2x v25, r1, r4 # 16-byte Folded Spill
-; FAST-NEXT: li r4, 224
-; FAST-NEXT: xxlor v25, f13, f13
-; FAST-NEXT: stxvd2x v26, r1, r4 # 16-byte Folded Spill
-; FAST-NEXT: li r4, 240
-; FAST-NEXT: xxlor v26, f12, f12
-; FAST-NEXT: stxvd2x v27, r1, r4 # 16-byte Folded Spill
-; FAST-NEXT: li r4, 256
-; FAST-NEXT: xxlor v27, f11, f11
-; FAST-NEXT: stxvd2x v28, r1, r4 # 16-byte Folded Spill
-; FAST-NEXT: li r4, 272
-; FAST-NEXT: xxlor v28, f10, f10
-; FAST-NEXT: stxvd2x v29, r1, r4 # 16-byte Folded Spill
-; FAST-NEXT: li r4, 288
-; FAST-NEXT: xxlor v29, f8, f8
-; FAST-NEXT: stxvd2x v30, r1, r4 # 16-byte Folded Spill
-; FAST-NEXT: li r4, 304
-; FAST-NEXT: xxlor v30, f7, f7
-; FAST-NEXT: stxvd2x v31, r1, r4 # 16-byte Folded Spill
-; FAST-NEXT: li r4, 44
-; FAST-NEXT: xxlor v31, f6, f6
-; FAST-NEXT: stxsspx f1, r1, r4 # 4-byte Folded Spill
-; FAST-NEXT: lfs f1, 768(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: li r3, 120
-; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill
-; FAST-NEXT: lfs f1, 760(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: li r3, 112
-; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill
-; FAST-NEXT: lfs f1, 752(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: li r3, 104
-; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill
-; FAST-NEXT: lfs f1, 744(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: li r3, 96
-; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill
-; FAST-NEXT: lfs f1, 736(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: li r3, 88
-; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill
-; FAST-NEXT: lfs f1, 728(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: li r3, 80
-; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill
-; FAST-NEXT: lfs f1, 720(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: li r3, 72
-; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill
-; FAST-NEXT: lfs f1, 712(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: li r3, 64
-; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill
-; FAST-NEXT: lfs f1, 704(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: li r3, 56
-; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill
-; FAST-NEXT: lfs f1, 696(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: li r3, 48
-; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill
-; FAST-NEXT: lfs f1, 688(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: xxlor v21, f1, f1
-; FAST-NEXT: lfs f1, 680(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: xxlor v20, f1, f1
-; FAST-NEXT: lfs f1, 672(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: xxlor v24, f1, f1
-; FAST-NEXT: lfs f1, 664(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f31, f1
-; FAST-NEXT: lfs f1, 656(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f30, f1
-; FAST-NEXT: lfs f1, 648(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f28, f1
-; FAST-NEXT: lfs f1, 640(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f27, f1
-; FAST-NEXT: lfs f1, 632(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f26, f1
-; FAST-NEXT: lfs f1, 624(r1)
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f25, f1
-; FAST-NEXT: xxlor f1, v25, v25
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f24, f1
-; FAST-NEXT: xxlor f1, v26, v26
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f23, f1
-; FAST-NEXT: xxlor f1, v27, v27
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f22, f1
-; FAST-NEXT: xxlor f1, v28, v28
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f21, f1
-; FAST-NEXT: fmr f1, f29
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f20, f1
-; FAST-NEXT: xxlor f1, v29, v29
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f19, f1
-; FAST-NEXT: xxlor f1, v30, v30
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f18, f1
-; FAST-NEXT: xxlor f1, v31, v31
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f29, f1
-; FAST-NEXT: fmr f1, f14
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f14, f1
-; FAST-NEXT: fmr f1, f16
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f16, f1
-; FAST-NEXT: xxlor f1, v22, v22
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fmr f17, f1
-; FAST-NEXT: xxlor f1, v23, v23
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: li r3, 44
-; FAST-NEXT: fmr f15, f1
-; FAST-NEXT: lxsspx f1, r1, r3 # 4-byte Folded Reload
-; FAST-NEXT: bl __truncsfhf2
-; FAST-NEXT: nop
-; FAST-NEXT: clrldi r3, r3, 48
-; FAST-NEXT: bl __extendhfsf2
-; FAST-NEXT: nop
-; FAST-NEXT: fctid f3, f15
-; FAST-NEXT: fctid f4, f17
-; FAST-NEXT: mffprd r3, f3
-; FAST-NEXT: fctid f5, f16
-; FAST-NEXT: fctid f6, f14
-; FAST-NEXT: fctid f7, f18
-; FAST-NEXT: fctid f8, f19
-; FAST-NEXT: fctid f13, f1
-; FAST-NEXT: fctid f9, f20
-; FAST-NEXT: fctid f10, f22
-; FAST-NEXT: fctid f11, f24
-; FAST-NEXT: fctid f12, f25
-; FAST-NEXT: fctid f2, f23
-; FAST-NEXT: fctid f0, f21
-; FAST-NEXT: mtvsrd v2, r3
-; FAST-NEXT: mffprd r3, f4
-; FAST-NEXT: mtvsrd v3, r3
-; FAST-NEXT: mffprd r3, f5
-; FAST-NEXT: mtfprd f5, r3
-; FAST-NEXT: mffprd r3, f6
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: mffprd r3, f7
-; FAST-NEXT: mtfprd f6, r3
-; FAST-NEXT: mffprd r3, f8
-; FAST-NEXT: mtfprd f7, r3
-; FAST-NEXT: mffprd r3, f9
-; FAST-NEXT: mtfprd f3, r3
-; FAST-NEXT: mffprd r3, f10
-; FAST-NEXT: mtfprd f4, r3
-; FAST-NEXT: mffprd r3, f11
-; FAST-NEXT: fctid f11, f31
-; FAST-NEXT: lfd f31, 56(r1) # 8-byte Folded Reload
-; FAST-NEXT: mtfprd f8, r3
-; FAST-NEXT: mffprd r3, f12
-; FAST-NEXT: xxlor f12, v24, v24
-; FAST-NEXT: fctid f31, f31
-; FAST-NEXT: fctid f12, f12
-; FAST-NEXT: mtfprd f9, r3
-; FAST-NEXT: mffprd r3, f13
-; FAST-NEXT: lfd f13, 48(r1) # 8-byte Folded Reload
-; FAST-NEXT: mtfprd f10, r3
-; FAST-NEXT: fctid f13, f13
-; FAST-NEXT: xxmrghd v3, vs5, v3
-; FAST-NEXT: fctid f5, f26
-; FAST-NEXT: mffprd r3, f5
-; FAST-NEXT: mtfprd f5, r3
-; FAST-NEXT: xxmrghd v4, vs7, vs6
-; FAST-NEXT: fctid f6, f27
-; FAST-NEXT: fctid f7, f28
-; FAST-NEXT: mffprd r3, f6
-; FAST-NEXT: lfd f28, 96(r1) # 8-byte Folded Reload
-; FAST-NEXT: fctid f28, f28
-; FAST-NEXT: mtfprd f6, r3
-; FAST-NEXT: mffprd r3, f7
-; FAST-NEXT: mtfprd f7, r3
-; FAST-NEXT: xxmrghd v2, v2, vs10
-; FAST-NEXT: fctid f10, f30
-; FAST-NEXT: mffprd r3, f10
-; FAST-NEXT: lfd f30, 80(r1) # 8-byte Folded Reload
-; FAST-NEXT: fctid f30, f30
-; FAST-NEXT: mtfprd f10, r3
-; FAST-NEXT: mffprd r3, f11
-; FAST-NEXT: mtfprd f11, r3
-; FAST-NEXT: mffprd r3, f12
-; FAST-NEXT: mtfprd f12, r3
-; FAST-NEXT: xxmrghd v5, vs12, vs11
-; FAST-NEXT: xxlor f11, v20, v20
-; FAST-NEXT: xxlor f12, v21, v21
-; FAST-NEXT: fctid f11, f11
-; FAST-NEXT: fctid f12, f12
-; FAST-NEXT: mffprd r3, f11
-; FAST-NEXT: mtfprd f11, r3
-; FAST-NEXT: mffprd r3, f12
-; FAST-NEXT: mtfprd f12, r3
-; FAST-NEXT: mffprd r3, f13
-; FAST-NEXT: mtfprd f13, r3
-; FAST-NEXT: mffprd r3, f31
-; FAST-NEXT: lfd f31, 64(r1) # 8-byte Folded Reload
-; FAST-NEXT: fctid f31, f31
-; FAST-NEXT: mtvsrd v0, r3
-; FAST-NEXT: mffprd r3, f31
-; FAST-NEXT: lfd f31, 72(r1) # 8-byte Folded Reload
-; FAST-NEXT: mtvsrd v1, r3
-; FAST-NEXT: mffprd r3, f30
-; FAST-NEXT: lfd f30, 88(r1) # 8-byte Folded Reload
-; FAST-NEXT: fctid f31, f31
-; FAST-NEXT: mtvsrd v6, r3
-; FAST-NEXT: mffprd r3, f28
-; FAST-NEXT: lfd f28, 104(r1) # 8-byte Folded Reload
-; FAST-NEXT: fctid f30, f30
-; FAST-NEXT: fctid f28, f28
-; FAST-NEXT: mtvsrd v7, r3
-; FAST-NEXT: mffprd r3, f28
-; FAST-NEXT: lfd f28, 112(r1) # 8-byte Folded Reload
-; FAST-NEXT: fctid f28, f28
-; FAST-NEXT: mtvsrd v8, r3
-; FAST-NEXT: mffprd r3, f28
-; FAST-NEXT: lfd f28, 120(r1) # 8-byte Folded Reload
-; FAST-NEXT: fctid f28, f28
-; FAST-NEXT: xxmrghd v10, vs12, vs11
-; FAST-NEXT: xxmrghd v0, v0, vs13
-; FAST-NEXT: xxswapd vs12, v0
-; FAST-NEXT: xxmrghd v0, vs9, vs8
-; FAST-NEXT: xxmrghd v7, v8, v7
-; FAST-NEXT: mtvsrd v8, r3
-; FAST-NEXT: mffprd r3, f28
-; FAST-NEXT: mtvsrd v9, r3
-; FAST-NEXT: mffprd r3, f30
-; FAST-NEXT: xxswapd v7, v7
-; FAST-NEXT: xxmrghd v8, v9, v8
-; FAST-NEXT: mtvsrd v9, r3
-; FAST-NEXT: mffprd r3, f31
-; FAST-NEXT: xxswapd v8, v8
-; FAST-NEXT: xxmrghd v6, v9, v6
-; FAST-NEXT: mtvsrd v9, r3
-; FAST-NEXT: li r3, 240
-; FAST-NEXT: stxvd2x v8, r30, r3
-; FAST-NEXT: li r3, 224
-; FAST-NEXT: stxvd2x v7, r30, r3
-; FAST-NEXT: li r3, 208
-; FAST-NEXT: xxswapd vs11, v6
-; FAST-NEXT: xxmrghd v6, vs10, vs7
-; FAST-NEXT: stxvd2x vs11, r30, r3
-; FAST-NEXT: li r3, 192
-; FAST-NEXT: xxmrghd v1, v9, v1
-; FAST-NEXT: xxswapd vs11, v1
-; FAST-NEXT: xxmrghd v1, vs6, vs5
-; FAST-NEXT: xxswapd vs5, v10
-; FAST-NEXT: xxswapd vs6, v5
-; FAST-NEXT: stxvd2x vs11, r30, r3
-; FAST-NEXT: li r3, 176
-; FAST-NEXT: stxvd2x vs12, r30, r3
-; FAST-NEXT: li r3, 160
-; FAST-NEXT: stxvd2x vs5, r30, r3
-; FAST-NEXT: li r3, 144
-; FAST-NEXT: stxvd2x vs6, r30, r3
-; FAST-NEXT: mffprd r3, f2
-; FAST-NEXT: mtfprd f7, r3
-; FAST-NEXT: li r3, 128
-; FAST-NEXT: xxswapd vs5, v6
-; FAST-NEXT: stxvd2x vs5, r30, r3
-; FAST-NEXT: li r3, 112
-; FAST-NEXT: xxswapd vs2, v1
-; FAST-NEXT: xxswapd vs6, v0
-; FAST-NEXT: stxvd2x vs2, r30, r3
-; FAST-NEXT: li r3, 96
-; FAST-NEXT: fctid f2, f29
-; FAST-NEXT: stxvd2x vs6, r30, r3
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: mffprd r3, f2
-; FAST-NEXT: mtfprd f2, r3
-; FAST-NEXT: li r3, 80
-; FAST-NEXT: xxmrghd v5, vs7, vs4
-; FAST-NEXT: xxswapd vs4, v2
-; FAST-NEXT: xxmrghd v0, vs0, vs3
-; FAST-NEXT: xxswapd vs0, v5
-; FAST-NEXT: xxswapd vs3, v3
-; FAST-NEXT: stxvd2x vs0, r30, r3
-; FAST-NEXT: li r3, 64
-; FAST-NEXT: xxswapd vs0, v0
-; FAST-NEXT: stxvd2x vs0, r30, r3
-; FAST-NEXT: li r3, 48
-; FAST-NEXT: xxmrghd v5, vs2, vs1
-; FAST-NEXT: xxswapd vs1, v4
-; FAST-NEXT: stxvd2x vs1, r30, r3
-; FAST-NEXT: li r3, 32
-; FAST-NEXT: xxswapd vs2, v5
-; FAST-NEXT: stxvd2x vs2, r30, r3
-; FAST-NEXT: li r3, 16
-; FAST-NEXT: stxvd2x vs3, r30, r3
-; FAST-NEXT: li r3, 304
-; FAST-NEXT: stxvd2x vs4, 0, r30
-; FAST-NEXT: lfd f31, 472(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f30, 464(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f29, 456(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f28, 448(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f27, 440(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f26, 432(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f25, 424(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f24, 416(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f23, 408(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f22, 400(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f21, 392(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f20, 384(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f19, 376(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f18, 368(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f17, 360(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f16, 352(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f15, 344(r1) # 8-byte Folded Reload
-; FAST-NEXT: lfd f14, 336(r1) # 8-byte Folded Reload
-; FAST-NEXT: lxvd2x v31, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 288
-; FAST-NEXT: ld r30, 320(r1) # 8-byte Folded Reload
-; FAST-NEXT: lxvd2x v30, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 272
-; FAST-NEXT: lxvd2x v29, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 256
-; FAST-NEXT: lxvd2x v28, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 240
-; FAST-NEXT: lxvd2x v27, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 224
-; FAST-NEXT: lxvd2x v26, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 208
-; FAST-NEXT: lxvd2x v25, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 192
-; FAST-NEXT: lxvd2x v24, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 176
-; FAST-NEXT: lxvd2x v23, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 160
-; FAST-NEXT: lxvd2x v22, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 144
-; FAST-NEXT: lxvd2x v21, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 128
-; FAST-NEXT: lxvd2x v20, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: addi r1, r1, 480
-; FAST-NEXT: ld r0, 16(r1)
-; FAST-NEXT: mtlr r0
-; FAST-NEXT: blr
%a = call <32 x i64> @llvm.lrint.v32i64.v32f16(<32 x half> %x)
ret <32 x i64> %a
}
@@ -3400,12 +2423,6 @@ define <1 x i64> @lrint_v1f32(<1 x float> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: lrint_v1f32:
-; FAST: # %bb.0:
-; FAST-NEXT: fctid f0, f1
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: blr
%a = call <1 x i64> @llvm.lrint.v1i64.v1f32(<1 x float> %x)
ret <1 x i64> %a
}
@@ -3459,21 +2476,6 @@ define <2 x i64> @lrint_v2f32(<2 x float> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: lrint_v2f32:
-; FAST: # %bb.0:
-; FAST-NEXT: xxsldwi vs0, v2, v2, 3
-; FAST-NEXT: xxswapd vs1, v2
-; FAST-NEXT: xscvspdpn f0, vs0
-; FAST-NEXT: xscvspdpn f1, vs1
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: fctid f1, f1
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: mffprd r3, f1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: xxmrghd v2, vs1, vs0
-; FAST-NEXT: blr
%a = call <2 x i64> @llvm.lrint.v2i64.v2f32(<2 x float> %x)
ret <2 x i64> %a
}
@@ -3552,32 +2554,6 @@ define <4 x i64> @lrint_v4f32(<4 x float> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: lrint_v4f32:
-; FAST: # %bb.0:
-; FAST-NEXT: xxsldwi vs0, v2, v2, 3
-; FAST-NEXT: xxswapd vs1, v2
-; FAST-NEXT: xscvspdpn f0, vs0
-; FAST-NEXT: xxsldwi vs2, v2, v2, 1
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: xscvspdpn f0, vs1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v4, vs0, vs1
-; FAST-NEXT: xscvspdpn f0, v2
-; FAST-NEXT: vmr v2, v4
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: xscvspdpn f0, vs2
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v3, vs1, vs0
-; FAST-NEXT: blr
%a = call <4 x i64> @llvm.lrint.v4i64.v4f32(<4 x float> %x)
ret <4 x i64> %a
}
@@ -3710,54 +2686,6 @@ define <8 x i64> @lrint_v8f32(<8 x float> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: lrint_v8f32:
-; FAST: # %bb.0:
-; FAST-NEXT: xxsldwi vs0, v2, v2, 3
-; FAST-NEXT: xxswapd vs1, v2
-; FAST-NEXT: xscvspdpn f0, vs0
-; FAST-NEXT: xxsldwi vs2, v2, v2, 1
-; FAST-NEXT: xxsldwi vs3, v3, v3, 3
-; FAST-NEXT: xxswapd vs4, v3
-; FAST-NEXT: xxsldwi vs5, v3, v3, 1
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: xscvspdpn f0, vs1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v0, vs0, vs1
-; FAST-NEXT: xscvspdpn f0, v2
-; FAST-NEXT: vmr v2, v0
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: xscvspdpn f0, vs2
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v1, vs1, vs0
-; FAST-NEXT: xscvspdpn f0, vs3
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: xscvspdpn f0, vs4
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v4, vs0, vs1
-; FAST-NEXT: xscvspdpn f0, v3
-; FAST-NEXT: vmr v3, v1
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: xscvspdpn f0, vs5
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v5, vs1, vs0
-; FAST-NEXT: blr
%a = call <8 x i64> @llvm.lrint.v8i64.v8f32(<8 x float> %x)
ret <8 x i64> %a
}
@@ -3998,98 +2926,6 @@ define <16 x i64> @lrint_v16i64_v16f32(<16 x float> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: lrint_v16i64_v16f32:
-; FAST: # %bb.0:
-; FAST-NEXT: xxsldwi vs0, v2, v2, 3
-; FAST-NEXT: xxswapd vs1, v2
-; FAST-NEXT: xscvspdpn f0, vs0
-; FAST-NEXT: xxsldwi vs2, v2, v2, 1
-; FAST-NEXT: xxsldwi vs3, v3, v3, 3
-; FAST-NEXT: xxswapd vs4, v3
-; FAST-NEXT: xxsldwi vs5, v3, v3, 1
-; FAST-NEXT: xxsldwi vs6, v4, v4, 3
-; FAST-NEXT: xxswapd vs7, v4
-; FAST-NEXT: xxsldwi vs8, v4, v4, 1
-; FAST-NEXT: xxsldwi vs9, v5, v5, 3
-; FAST-NEXT: xxswapd vs10, v5
-; FAST-NEXT: xxsldwi vs11, v5, v5, 1
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: xscvspdpn f0, vs1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v0, vs0, vs1
-; FAST-NEXT: xscvspdpn f0, v2
-; FAST-NEXT: vmr v2, v0
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: xscvspdpn f0, vs2
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v1, vs1, vs0
-; FAST-NEXT: xscvspdpn f0, vs3
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: xscvspdpn f0, vs4
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v10, vs0, vs1
-; FAST-NEXT: xscvspdpn f0, v3
-; FAST-NEXT: vmr v3, v1
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: xscvspdpn f0, vs5
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v11, vs1, vs0
-; FAST-NEXT: xscvspdpn f0, vs6
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: xscvspdpn f0, vs7
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v6, vs0, vs1
-; FAST-NEXT: xscvspdpn f0, v4
-; FAST-NEXT: xscvspdpn f1, vs8
-; FAST-NEXT: vmr v4, v10
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: fctid f1, f1
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: mffprd r3, f1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: xxmrghd v7, vs0, vs1
-; FAST-NEXT: xscvspdpn f0, vs9
-; FAST-NEXT: xscvspdpn f1, vs10
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: fctid f1, f1
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: mffprd r3, f1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: xxmrghd v8, vs1, vs0
-; FAST-NEXT: xscvspdpn f0, v5
-; FAST-NEXT: xscvspdpn f1, vs11
-; FAST-NEXT: vmr v5, v11
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: fctid f1, f1
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: mffprd r3, f1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: xxmrghd v9, vs0, vs1
-; FAST-NEXT: blr
%a = call <16 x i64> @llvm.lrint.v16i64.v16f32(<16 x float> %x)
ret <16 x i64> %a
}
@@ -4119,12 +2955,6 @@ define <1 x i64> @lrint_v1f64(<1 x double> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: lrint_v1f64:
-; FAST: # %bb.0:
-; FAST-NEXT: fctid f0, f1
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: blr
%a = call <1 x i64> @llvm.lrint.v1i64.v1f64(<1 x double> %x)
ret <1 x i64> %a
}
@@ -4179,19 +3009,6 @@ define <2 x i64> @lrint_v2f64(<2 x double> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: lrint_v2f64:
-; FAST: # %bb.0:
-; FAST-NEXT: xxlor f1, v2, v2
-; FAST-NEXT: xxswapd vs0, v2
-; FAST-NEXT: fctid f1, f1
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r3, f1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: mffprd r3, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v2, vs1, vs0
-; FAST-NEXT: blr
%a = call <2 x i64> @llvm.lrint.v2i64.v2f64(<2 x double> %x)
ret <2 x i64> %a
}
@@ -4276,28 +3093,6 @@ define <4 x i64> @lrint_v4f64(<4 x double> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: lrint_v4f64:
-; FAST: # %bb.0:
-; FAST-NEXT: xxswapd vs0, v2
-; FAST-NEXT: xxlor f2, v2, v2
-; FAST-NEXT: xxswapd vs1, v3
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: fctid f2, f2
-; FAST-NEXT: fctid f1, f1
-; FAST-NEXT: mffprd r4, f0
-; FAST-NEXT: xxlor f0, v3, v3
-; FAST-NEXT: mffprd r3, f2
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mtfprd f2, r4
-; FAST-NEXT: mffprd r5, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: mffprd r3, f1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: xxmrghd v2, vs0, vs2
-; FAST-NEXT: mtfprd f0, r5
-; FAST-NEXT: xxmrghd v3, vs0, vs1
-; FAST-NEXT: blr
%a = call <4 x i64> @llvm.lrint.v4i64.v4f64(<4 x double> %x)
ret <4 x i64> %a
}
@@ -4442,46 +3237,6 @@ define <8 x i64> @lrint_v8f64(<8 x double> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: lrint_v8f64:
-; FAST: # %bb.0:
-; FAST-NEXT: xxswapd vs0, v2
-; FAST-NEXT: xxswapd vs1, v3
-; FAST-NEXT: xxlor f4, v2, v2
-; FAST-NEXT: xxswapd vs2, v4
-; FAST-NEXT: xxswapd vs3, v5
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: fctid f4, f4
-; FAST-NEXT: mffprd r4, f0
-; FAST-NEXT: xxlor f0, v3, v3
-; FAST-NEXT: mffprd r3, f4
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mffprd r5, f0
-; FAST-NEXT: fctid f0, f1
-; FAST-NEXT: mtfprd f1, r4
-; FAST-NEXT: mffprd r6, f0
-; FAST-NEXT: xxlor f0, v4, v4
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mtfprd f4, r6
-; FAST-NEXT: mffprd r7, f0
-; FAST-NEXT: fctid f0, f2
-; FAST-NEXT: mtfprd f2, r5
-; FAST-NEXT: mtfprd f5, r7
-; FAST-NEXT: mffprd r8, f0
-; FAST-NEXT: xxlor f0, v5, v5
-; FAST-NEXT: fctid f0, f0
-; FAST-NEXT: mtfprd f6, r8
-; FAST-NEXT: mffprd r9, f0
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: xxmrghd v3, vs2, vs4
-; FAST-NEXT: xxmrghd v4, vs5, vs6
-; FAST-NEXT: xxmrghd v2, vs0, vs1
-; FAST-NEXT: fctid f1, f3
-; FAST-NEXT: mtfprd f0, r9
-; FAST-NEXT: mffprd r3, f1
-; FAST-NEXT: mtfprd f1, r3
-; FAST-NEXT: xxmrghd v5, vs0, vs1
-; FAST-NEXT: blr
%a = call <8 x i64> @llvm.lrint.v8i64.v8f64(<8 x double> %x)
ret <8 x i64> %a
}
@@ -4511,18 +3266,6 @@ define <1 x i64> @lrint_v1f128(<1 x fp128> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: lrint_v1f128:
-; FAST: # %bb.0:
-; FAST-NEXT: mflr r0
-; FAST-NEXT: stdu r1, -32(r1)
-; FAST-NEXT: std r0, 48(r1)
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: addi r1, r1, 32
-; FAST-NEXT: ld r0, 16(r1)
-; FAST-NEXT: mtlr r0
-; FAST-NEXT: blr
%a = call <1 x i64> @llvm.lrint.v1i64.v1f128(<1 x fp128> %x)
ret <1 x i64> %a
}
@@ -4580,33 +3323,6 @@ define <2 x i64> @lrint_v2f128(<2 x fp128> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: lrint_v2f128:
-; FAST: # %bb.0:
-; FAST-NEXT: mflr r0
-; FAST-NEXT: stdu r1, -80(r1)
-; FAST-NEXT: li r3, 48
-; FAST-NEXT: std r0, 96(r1)
-; FAST-NEXT: stxvd2x v30, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 64
-; FAST-NEXT: stxvd2x v31, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: vmr v31, v3
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: vmr v2, v31
-; FAST-NEXT: mtvsrd v30, r3
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: li r3, 64
-; FAST-NEXT: lxvd2x v31, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 48
-; FAST-NEXT: xxmrghd v2, vs0, v30
-; FAST-NEXT: lxvd2x v30, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: addi r1, r1, 80
-; FAST-NEXT: ld r0, 16(r1)
-; FAST-NEXT: mtlr r0
-; FAST-NEXT: blr
%a = call <2 x i64> @llvm.lrint.v2i64.v2f128(<2 x fp128> %x)
ret <2 x i64> %a
}
@@ -4704,53 +3420,6 @@ define <4 x i64> @lrint_v4f128(<4 x fp128> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: lrint_v4f128:
-; FAST: # %bb.0:
-; FAST-NEXT: mflr r0
-; FAST-NEXT: stdu r1, -112(r1)
-; FAST-NEXT: li r3, 48
-; FAST-NEXT: std r0, 128(r1)
-; FAST-NEXT: stxvd2x v28, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 64
-; FAST-NEXT: stxvd2x v29, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 80
-; FAST-NEXT: vmr v29, v3
-; FAST-NEXT: stxvd2x v30, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 96
-; FAST-NEXT: vmr v30, v4
-; FAST-NEXT: stxvd2x v31, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: vmr v31, v5
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: vmr v2, v29
-; FAST-NEXT: mtvsrd v28, r3
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: vmr v2, v30
-; FAST-NEXT: xxmrghd v29, vs0, v28
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: vmr v2, v31
-; FAST-NEXT: mtvsrd v30, r3
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: li r3, 96
-; FAST-NEXT: vmr v2, v29
-; FAST-NEXT: lxvd2x v31, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 80
-; FAST-NEXT: xxmrghd v3, vs0, v30
-; FAST-NEXT: lxvd2x v30, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 64
-; FAST-NEXT: lxvd2x v29, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 48
-; FAST-NEXT: lxvd2x v28, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: addi r1, r1, 112
-; FAST-NEXT: ld r0, 16(r1)
-; FAST-NEXT: mtlr r0
-; FAST-NEXT: blr
%a = call <4 x i64> @llvm.lrint.v4i64.v4f128(<4 x fp128> %x)
ret <4 x i64> %a
}
@@ -4928,93 +3597,6 @@ define <8 x i64> @lrint_v8f128(<8 x fp128> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: lrint_v8f128:
-; FAST: # %bb.0:
-; FAST-NEXT: mflr r0
-; FAST-NEXT: stdu r1, -176(r1)
-; FAST-NEXT: li r3, 48
-; FAST-NEXT: std r0, 192(r1)
-; FAST-NEXT: stxvd2x v24, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 64
-; FAST-NEXT: stxvd2x v25, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 80
-; FAST-NEXT: vmr v25, v3
-; FAST-NEXT: stxvd2x v26, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 96
-; FAST-NEXT: vmr v26, v4
-; FAST-NEXT: stxvd2x v27, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 112
-; FAST-NEXT: vmr v27, v5
-; FAST-NEXT: stxvd2x v28, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 128
-; FAST-NEXT: vmr v28, v6
-; FAST-NEXT: stxvd2x v29, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 144
-; FAST-NEXT: vmr v29, v7
-; FAST-NEXT: stxvd2x v30, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 160
-; FAST-NEXT: vmr v30, v8
-; FAST-NEXT: stxvd2x v31, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: vmr v31, v9
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: vmr v2, v25
-; FAST-NEXT: mtvsrd v24, r3
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: vmr v2, v26
-; FAST-NEXT: xxmrghd v25, vs0, v24
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: vmr v2, v27
-; FAST-NEXT: mtvsrd v26, r3
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: vmr v2, v28
-; FAST-NEXT: xxmrghd v27, vs0, v26
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: vmr v2, v29
-; FAST-NEXT: mtvsrd v28, r3
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: vmr v2, v30
-; FAST-NEXT: xxmrghd v29, vs0, v28
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: vmr v2, v31
-; FAST-NEXT: mtvsrd v30, r3
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: li r3, 160
-; FAST-NEXT: vmr v4, v29
-; FAST-NEXT: lxvd2x v31, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 144
-; FAST-NEXT: vmr v3, v27
-; FAST-NEXT: vmr v2, v25
-; FAST-NEXT: xxmrghd v5, vs0, v30
-; FAST-NEXT: lxvd2x v30, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 128
-; FAST-NEXT: lxvd2x v29, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 112
-; FAST-NEXT: lxvd2x v28, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 96
-; FAST-NEXT: lxvd2x v27, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 80
-; FAST-NEXT: lxvd2x v26, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 64
-; FAST-NEXT: lxvd2x v25, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 48
-; FAST-NEXT: lxvd2x v24, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: addi r1, r1, 176
-; FAST-NEXT: ld r0, 16(r1)
-; FAST-NEXT: mtlr r0
-; FAST-NEXT: blr
%a = call <8 x i64> @llvm.lrint.v8i64.v8f128(<8 x fp128> %x)
ret <8 x i64> %a
}
@@ -5355,176 +3937,6 @@ define <16 x i64> @lrint_v16i64_v16f128(<16 x fp128> %x) nounwind {
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
-;
-; FAST-LABEL: lrint_v16i64_v16f128:
-; FAST: # %bb.0:
-; FAST-NEXT: mflr r0
-; FAST-NEXT: stdu r1, -304(r1)
-; FAST-NEXT: li r3, 112
-; FAST-NEXT: std r0, 320(r1)
-; FAST-NEXT: stxvd2x v20, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 128
-; FAST-NEXT: stxvd2x v21, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 144
-; FAST-NEXT: vmr v21, v4
-; FAST-NEXT: stxvd2x v22, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 160
-; FAST-NEXT: vmr v22, v6
-; FAST-NEXT: stxvd2x v23, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 176
-; FAST-NEXT: vmr v23, v8
-; FAST-NEXT: stxvd2x v24, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 192
-; FAST-NEXT: vmr v24, v9
-; FAST-NEXT: stxvd2x v25, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 208
-; FAST-NEXT: vmr v25, v7
-; FAST-NEXT: stxvd2x v26, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 224
-; FAST-NEXT: vmr v26, v10
-; FAST-NEXT: stxvd2x v27, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 240
-; FAST-NEXT: vmr v27, v5
-; FAST-NEXT: stxvd2x v28, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 256
-; FAST-NEXT: vmr v28, v11
-; FAST-NEXT: stxvd2x v29, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 272
-; FAST-NEXT: vmr v29, v12
-; FAST-NEXT: stxvd2x v30, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 288
-; FAST-NEXT: vmr v30, v3
-; FAST-NEXT: stxvd2x v31, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 64
-; FAST-NEXT: stxvd2x v13, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: addi r3, r1, 576
-; FAST-NEXT: lxvd2x vs0, 0, r3
-; FAST-NEXT: addi r3, r1, 560
-; FAST-NEXT: lxvd2x vs1, 0, r3
-; FAST-NEXT: addi r3, r1, 544
-; FAST-NEXT: lxvd2x vs2, 0, r3
-; FAST-NEXT: li r3, 96
-; FAST-NEXT: xxswapd vs0, vs0
-; FAST-NEXT: stxvd2x vs0, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 80
-; FAST-NEXT: xxswapd vs0, vs1
-; FAST-NEXT: stxvd2x vs0, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: li r3, 48
-; FAST-NEXT: xxswapd vs0, vs2
-; FAST-NEXT: stxvd2x vs0, r1, r3 # 16-byte Folded Spill
-; FAST-NEXT: addi r3, r1, 528
-; FAST-NEXT: lxvd2x vs0, 0, r3
-; FAST-NEXT: xxswapd v31, vs0
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: vmr v2, v30
-; FAST-NEXT: mtvsrd v20, r3
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: vmr v2, v21
-; FAST-NEXT: xxmrghd v30, vs0, v20
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: vmr v2, v27
-; FAST-NEXT: mtvsrd v21, r3
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: vmr v2, v22
-; FAST-NEXT: xxmrghd v27, vs0, v21
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: vmr v2, v25
-; FAST-NEXT: mtvsrd v22, r3
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: vmr v2, v23
-; FAST-NEXT: xxmrghd v25, vs0, v22
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: vmr v2, v24
-; FAST-NEXT: mtvsrd v23, r3
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: vmr v2, v26
-; FAST-NEXT: xxmrghd v24, vs0, v23
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: vmr v2, v28
-; FAST-NEXT: mtvsrd v26, r3
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: vmr v2, v29
-; FAST-NEXT: xxmrghd v28, vs0, v26
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: mtvsrd v29, r3
-; FAST-NEXT: li r3, 64
-; FAST-NEXT: lxvd2x v2, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: vmr v2, v31
-; FAST-NEXT: xxmrghd v29, vs0, v29
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: mtvsrd v31, r3
-; FAST-NEXT: li r3, 48
-; FAST-NEXT: lxvd2x v2, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: li r3, 80
-; FAST-NEXT: lxvd2x v2, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: xxmrghd v31, vs0, v31
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: mtvsrd v26, r3
-; FAST-NEXT: li r3, 96
-; FAST-NEXT: lxvd2x v2, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: bl lrintf128
-; FAST-NEXT: nop
-; FAST-NEXT: mtfprd f0, r3
-; FAST-NEXT: li r3, 288
-; FAST-NEXT: vmr v8, v31
-; FAST-NEXT: lxvd2x v31, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 272
-; FAST-NEXT: vmr v2, v30
-; FAST-NEXT: vmr v7, v29
-; FAST-NEXT: vmr v6, v28
-; FAST-NEXT: vmr v3, v27
-; FAST-NEXT: lxvd2x v30, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 256
-; FAST-NEXT: vmr v4, v25
-; FAST-NEXT: vmr v5, v24
-; FAST-NEXT: lxvd2x v29, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 240
-; FAST-NEXT: lxvd2x v28, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 224
-; FAST-NEXT: lxvd2x v27, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 208
-; FAST-NEXT: xxmrghd v9, vs0, v26
-; FAST-NEXT: lxvd2x v26, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 192
-; FAST-NEXT: lxvd2x v25, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 176
-; FAST-NEXT: lxvd2x v24, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 160
-; FAST-NEXT: lxvd2x v23, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 144
-; FAST-NEXT: lxvd2x v22, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 128
-; FAST-NEXT: lxvd2x v21, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: li r3, 112
-; FAST-NEXT: lxvd2x v20, r1, r3 # 16-byte Folded Reload
-; FAST-NEXT: addi r1, r1, 304
-; FAST-NEXT: ld r0, 16(r1)
-; FAST-NEXT: mtlr r0
-; FAST-NEXT: blr
%a = call <16 x i64> @llvm.lrint.v16i64.v16f128(<16 x fp128> %x)
ret <16 x i64> %a
}
diff --git a/llvm/test/CodeGen/RISCV/rv32zbs.ll b/llvm/test/CodeGen/RISCV/rv32zbs.ll
index dcb70f8..f9527ef 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbs.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbs.ll
@@ -45,6 +45,32 @@ define i32 @bclr_i32_no_mask(i32 %a, i32 %b) nounwind {
ret i32 %and1
}
+define i32 @bclr_i32_mask_multiple(i32 %a, i32 %b, i32 %shamt) nounwind {
+; RV32I-LABEL: bclr_i32_mask_multiple:
+; RV32I: # %bb.0:
+; RV32I-NEXT: li a3, 1
+; RV32I-NEXT: sll a2, a3, a2
+; RV32I-NEXT: not a3, a2
+; RV32I-NEXT: and a0, a3, a0
+; RV32I-NEXT: or a1, a1, a2
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: ret
+;
+; RV32ZBS-LABEL: bclr_i32_mask_multiple:
+; RV32ZBS: # %bb.0:
+; RV32ZBS-NEXT: bclr a0, a0, a2
+; RV32ZBS-NEXT: bset a1, a1, a2
+; RV32ZBS-NEXT: add a0, a0, a1
+; RV32ZBS-NEXT: ret
+ %shamt_masked = and i32 %shamt, 63
+ %shl = shl nuw i32 1, %shamt_masked
+ %neg = xor i32 %shl, -1
+ %and = and i32 %neg, %a
+ %or = or i32 %b, %shl
+ %c = add i32 %and, %or
+ ret i32 %c
+}
+
define i64 @bclr_i64(i64 %a, i64 %b) nounwind {
; RV32I-LABEL: bclr_i64:
; RV32I: # %bb.0:
@@ -301,17 +327,17 @@ define i64 @bext_i64(i64 %a, i64 %b) nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: andi a3, a2, 63
; CHECK-NEXT: addi a4, a3, -32
-; CHECK-NEXT: bltz a4, .LBB12_2
+; CHECK-NEXT: bltz a4, .LBB13_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: srl a0, a1, a3
-; CHECK-NEXT: j .LBB12_3
-; CHECK-NEXT: .LBB12_2:
+; CHECK-NEXT: j .LBB13_3
+; CHECK-NEXT: .LBB13_2:
; CHECK-NEXT: srl a0, a0, a2
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: not a2, a3
; CHECK-NEXT: sll a1, a1, a2
; CHECK-NEXT: or a0, a0, a1
-; CHECK-NEXT: .LBB12_3:
+; CHECK-NEXT: .LBB13_3:
; CHECK-NEXT: andi a0, a0, 1
; CHECK-NEXT: li a1, 0
; CHECK-NEXT: ret
@@ -789,17 +815,17 @@ define i64 @bset_trailing_ones_i64_mask(i64 %a) nounwind {
; CHECK-NEXT: li a3, -1
; CHECK-NEXT: addi a1, a2, -32
; CHECK-NEXT: sll a0, a3, a0
-; CHECK-NEXT: bltz a1, .LBB43_2
+; CHECK-NEXT: bltz a1, .LBB44_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: sll a2, a3, a2
-; CHECK-NEXT: j .LBB43_3
-; CHECK-NEXT: .LBB43_2:
+; CHECK-NEXT: j .LBB44_3
+; CHECK-NEXT: .LBB44_2:
; CHECK-NEXT: not a2, a2
; CHECK-NEXT: lui a3, 524288
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: srl a2, a3, a2
; CHECK-NEXT: or a2, a0, a2
-; CHECK-NEXT: .LBB43_3:
+; CHECK-NEXT: .LBB44_3:
; CHECK-NEXT: srai a1, a1, 31
; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: not a1, a2
@@ -817,17 +843,17 @@ define i64 @bset_trailing_ones_i64_no_mask(i64 %a) nounwind {
; CHECK-NEXT: li a1, -1
; CHECK-NEXT: addi a2, a0, -32
; CHECK-NEXT: sll a1, a1, a0
-; CHECK-NEXT: bltz a2, .LBB44_2
+; CHECK-NEXT: bltz a2, .LBB45_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: j .LBB44_3
-; CHECK-NEXT: .LBB44_2:
+; CHECK-NEXT: j .LBB45_3
+; CHECK-NEXT: .LBB45_2:
; CHECK-NEXT: not a0, a0
; CHECK-NEXT: lui a3, 524288
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: srl a0, a3, a0
; CHECK-NEXT: or a0, a1, a0
-; CHECK-NEXT: .LBB44_3:
+; CHECK-NEXT: .LBB45_3:
; CHECK-NEXT: srai a2, a2, 31
; CHECK-NEXT: and a2, a2, a1
; CHECK-NEXT: not a1, a0
diff --git a/llvm/test/CodeGen/RISCV/rv64zbs.ll b/llvm/test/CodeGen/RISCV/rv64zbs.ll
index b4edcf6c..d42bc8e 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbs.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbs.ll
@@ -110,6 +110,32 @@ define i64 @bclr_i64_no_mask(i64 %a, i64 %b) nounwind {
ret i64 %and1
}
+define i64 @bclr_i64_mask_multiple(i64 %a, i64 %b, i64 %shamt) nounwind {
+; RV64I-LABEL: bclr_i64_mask_multiple:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a3, 1
+; RV64I-NEXT: sll a2, a3, a2
+; RV64I-NEXT: not a3, a2
+; RV64I-NEXT: and a0, a3, a0
+; RV64I-NEXT: or a1, a1, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBS-LABEL: bclr_i64_mask_multiple:
+; RV64ZBS: # %bb.0:
+; RV64ZBS-NEXT: bclr a0, a0, a2
+; RV64ZBS-NEXT: bset a1, a1, a2
+; RV64ZBS-NEXT: add a0, a0, a1
+; RV64ZBS-NEXT: ret
+ %shamt_masked = and i64 %shamt, 63
+ %shl = shl nuw i64 1, %shamt_masked
+ %neg = xor i64 %shl, -1
+ %and = and i64 %neg, %a
+ %or = or i64 %b, %shl
+ %c = add i64 %and, %or
+ ret i64 %c
+}
+
define signext i32 @bset_i32(i32 signext %a, i32 signext %b) nounwind {
; RV64I-LABEL: bset_i32:
; RV64I: # %bb.0:
@@ -372,19 +398,19 @@ define void @bext_i32_trunc(i32 signext %0, i32 signext %1) {
; RV64I: # %bb.0:
; RV64I-NEXT: srlw a0, a0, a1
; RV64I-NEXT: andi a0, a0, 1
-; RV64I-NEXT: beqz a0, .LBB19_2
+; RV64I-NEXT: beqz a0, .LBB20_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: ret
-; RV64I-NEXT: .LBB19_2:
+; RV64I-NEXT: .LBB20_2:
; RV64I-NEXT: tail bar
;
; RV64ZBS-LABEL: bext_i32_trunc:
; RV64ZBS: # %bb.0:
; RV64ZBS-NEXT: bext a0, a0, a1
-; RV64ZBS-NEXT: beqz a0, .LBB19_2
+; RV64ZBS-NEXT: beqz a0, .LBB20_2
; RV64ZBS-NEXT: # %bb.1:
; RV64ZBS-NEXT: ret
-; RV64ZBS-NEXT: .LBB19_2:
+; RV64ZBS-NEXT: .LBB20_2:
; RV64ZBS-NEXT: tail bar
%3 = shl i32 1, %1
%4 = and i32 %3, %0
diff --git a/llvm/test/CodeGen/SystemZ/int-conv-14.ll b/llvm/test/CodeGen/SystemZ/int-conv-14.ll
index 98dc88f..baab5ac 100644
--- a/llvm/test/CodeGen/SystemZ/int-conv-14.ll
+++ b/llvm/test/CodeGen/SystemZ/int-conv-14.ll
@@ -55,14 +55,15 @@ define i128 @f4(ptr %ptr) {
}
; Truncation to i64.
-define i64 @f5(i128 %a) {
+define i64 @f5(i128 %a, i128 %b) {
; CHECK-LABEL: f5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v0, 0(%r2), 3
-; CHECK-NEXT: vaq %v0, %v0, %v0
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 0(%r2), 3
+; CHECK-NEXT: vaq %v0, %v1, %v0
; CHECK-NEXT: vlgvg %r2, %v0, 1
; CHECK-NEXT: br %r14
- %op = add i128 %a, %a
+ %op = add i128 %a, %b
%res = trunc i128 %op to i64
ret i64 %res
}
@@ -134,15 +135,16 @@ define i128 @f10(ptr %ptr) {
}
; Truncation to i32.
-define i32 @f11(i128 %a) {
+define i32 @f11(i128 %a, i128 %b) {
; CHECK-LABEL: f11:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v0, 0(%r2), 3
-; CHECK-NEXT: vaq %v0, %v0, %v0
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 0(%r2), 3
+; CHECK-NEXT: vaq %v0, %v1, %v0
; CHECK-NEXT: vlgvf %r2, %v0, 3
; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
; CHECK-NEXT: br %r14
- %op = add i128 %a, %a
+ %op = add i128 %a, %b
%res = trunc i128 %op to i32
ret i32 %res
}
@@ -215,15 +217,16 @@ define i128 @f16(ptr %ptr) {
}
; Truncation to i16.
-define i16 @f17(i128 %a) {
+define i16 @f17(i128 %a, i128 %b) {
; CHECK-LABEL: f17:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v0, 0(%r2), 3
-; CHECK-NEXT: vaq %v0, %v0, %v0
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 0(%r2), 3
+; CHECK-NEXT: vaq %v0, %v1, %v0
; CHECK-NEXT: vlgvf %r2, %v0, 3
; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
; CHECK-NEXT: br %r14
- %op = add i128 %a, %a
+ %op = add i128 %a, %b
%res = trunc i128 %op to i16
ret i16 %res
}
@@ -296,15 +299,16 @@ define i128 @f22(ptr %ptr) {
}
; Truncation to i8.
-define i8 @f23(i128 %a) {
+define i8 @f23(i128 %a, i128 %b) {
; CHECK-LABEL: f23:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v0, 0(%r2), 3
-; CHECK-NEXT: vaq %v0, %v0, %v0
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 0(%r2), 3
+; CHECK-NEXT: vaq %v0, %v1, %v0
; CHECK-NEXT: vlgvf %r2, %v0, 3
; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
; CHECK-NEXT: br %r14
- %op = add i128 %a, %a
+ %op = add i128 %a, %b
%res = trunc i128 %op to i8
ret i8 %res
}
@@ -385,15 +389,16 @@ define i128 @f28(ptr %ptr) {
}
; Truncation to i1.
-define i1 @f29(i128 %a) {
+define i1 @f29(i128 %a, i128 %b) {
; CHECK-LABEL: f29:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v0, 0(%r2), 3
-; CHECK-NEXT: vaq %v0, %v0, %v0
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 0(%r2), 3
+; CHECK-NEXT: vaq %v0, %v1, %v0
; CHECK-NEXT: vlgvf %r2, %v0, 3
; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
; CHECK-NEXT: br %r14
- %op = add i128 %a, %a
+ %op = add i128 %a, %b
%res = trunc i128 %op to i1
ret i1 %res
}
diff --git a/llvm/test/CodeGen/SystemZ/int-conv-15.ll b/llvm/test/CodeGen/SystemZ/int-conv-15.ll
index 0d8ee75..f2c9ee5 100644
--- a/llvm/test/CodeGen/SystemZ/int-conv-15.ll
+++ b/llvm/test/CodeGen/SystemZ/int-conv-15.ll
@@ -55,14 +55,15 @@ define i128 @f4(ptr %ptr) {
}
; Truncation to i64.
-define i64 @f5(i128 %a) {
+define i64 @f5(i128 %a, i128 %b) {
; CHECK-LABEL: f5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v0, 0(%r2), 3
-; CHECK-NEXT: vaq %v0, %v0, %v0
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 0(%r2), 3
+; CHECK-NEXT: vaq %v0, %v1, %v0
; CHECK-NEXT: vlgvg %r2, %v0, 1
; CHECK-NEXT: br %r14
- %op = add i128 %a, %a
+ %op = add i128 %a, %b
%res = trunc i128 %op to i64
ret i64 %res
}
@@ -134,15 +135,16 @@ define i128 @f10(ptr %ptr) {
}
; Truncation to i32.
-define i32 @f11(i128 %a) {
+define i32 @f11(i128 %a, i128 %b) {
; CHECK-LABEL: f11:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v0, 0(%r2), 3
-; CHECK-NEXT: vaq %v0, %v0, %v0
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 0(%r2), 3
+; CHECK-NEXT: vaq %v0, %v1, %v0
; CHECK-NEXT: vlgvf %r2, %v0, 3
; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
; CHECK-NEXT: br %r14
- %op = add i128 %a, %a
+ %op = add i128 %a, %b
%res = trunc i128 %op to i32
ret i32 %res
}
@@ -215,15 +217,16 @@ define i128 @f16(ptr %ptr) {
}
; Truncation to i16.
-define i16 @f17(i128 %a) {
+define i16 @f17(i128 %a, i128 %b) {
; CHECK-LABEL: f17:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v0, 0(%r2), 3
-; CHECK-NEXT: vaq %v0, %v0, %v0
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 0(%r2), 3
+; CHECK-NEXT: vaq %v0, %v1, %v0
; CHECK-NEXT: vlgvf %r2, %v0, 3
; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
; CHECK-NEXT: br %r14
- %op = add i128 %a, %a
+ %op = add i128 %a, %b
%res = trunc i128 %op to i16
ret i16 %res
}
@@ -296,15 +299,16 @@ define i128 @f22(ptr %ptr) {
}
; Truncation to i8.
-define i8 @f23(i128 %a) {
+define i8 @f23(i128 %a, i128 %b) {
; CHECK-LABEL: f23:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v0, 0(%r2), 3
-; CHECK-NEXT: vaq %v0, %v0, %v0
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 0(%r2), 3
+; CHECK-NEXT: vaq %v0, %v1, %v0
; CHECK-NEXT: vlgvf %r2, %v0, 3
; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
; CHECK-NEXT: br %r14
- %op = add i128 %a, %a
+ %op = add i128 %a, %b
%res = trunc i128 %op to i8
ret i8 %res
}
@@ -383,15 +387,16 @@ define i128 @f28(ptr %ptr) {
}
; Truncation to i1.
-define i1 @f29(i128 %a) {
+define i1 @f29(i128 %a, i128 %b) {
; CHECK-LABEL: f29:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v0, 0(%r2), 3
-; CHECK-NEXT: vaq %v0, %v0, %v0
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 0(%r2), 3
+; CHECK-NEXT: vaq %v0, %v1, %v0
; CHECK-NEXT: vlgvf %r2, %v0, 3
; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
; CHECK-NEXT: br %r14
- %op = add i128 %a, %a
+ %op = add i128 %a, %b
%res = trunc i128 %op to i1
ret i1 %res
}