aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll30
-rw-r--r--llvm/test/CodeGen/AMDGPU/add-max.ll18
-rw-r--r--llvm/test/CodeGen/AMDGPU/addsub64_carry.ll36
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll1260
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll135
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll210
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll585
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll90
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll90
-rw-r--r--llvm/test/CodeGen/AMDGPU/bf16.ll26
-rw-r--r--llvm/test/CodeGen/AMDGPU/carryout-selection.ll614
-rw-r--r--llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll3
-rw-r--r--llvm/test/CodeGen/AMDGPU/ctpop16.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll25
-rw-r--r--llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll35
-rw-r--r--llvm/test/CodeGen/AMDGPU/fdiv.f64.ll7
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmed3.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/fneg-combines.legal.f16.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/fneg-combines.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/fpext.f16.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptosi.f16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptoui.f16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll128
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptrunc.ll44
-rw-r--r--llvm/test/CodeGen/AMDGPU/fract.f64.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/fract.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/frem.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll3
-rw-r--r--llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll115
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll81
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll81
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll115
-rw-r--r--llvm/test/CodeGen/AMDGPU/inline-attr.ll18
-rw-r--r--llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.add.min.max.ll191
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll35
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.exp2.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.log2.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll128
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll48
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll57
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll786
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll632
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll330
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll330
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll330
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll330
-rw-r--r--llvm/test/CodeGen/AMDGPU/minmax.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/optimize-compare.mir82
-rw-r--r--llvm/test/CodeGen/AMDGPU/prevent-fmul-hoist-ir.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/s_cmp_0.ll64
-rw-r--r--llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir83
-rw-r--r--llvm/test/CodeGen/AMDGPU/sdiv64.ll146
-rw-r--r--llvm/test/CodeGen/AMDGPU/sitofp.f16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/srem.ll654
-rw-r--r--llvm/test/CodeGen/AMDGPU/srem64.ll207
-rw-r--r--llvm/test/CodeGen/AMDGPU/stackguard.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/uaddo.ll54
-rw-r--r--llvm/test/CodeGen/AMDGPU/udiv64.ll80
-rw-r--r--llvm/test/CodeGen/AMDGPU/uitofp.f16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/urem64.ll146
-rw-r--r--llvm/test/CodeGen/AMDGPU/usubo.ll54
-rw-r--r--llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/wave32.ll190
-rw-r--r--llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll8
80 files changed, 5006 insertions, 3925 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir
index d9ac9a7..de1bb47 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -enable-unsafe-fp-math -run-pass=amdgpu-prelegalizer-combiner %s -o - | FileCheck -check-prefix=GFX10 %s
+# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-prelegalizer-combiner %s -o - | FileCheck -check-prefix=GFX10 %s
# Test that we fold correct element from G_UNMERGE_VALUES into fma
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
index 52b1beb..91f2f6f1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX10
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX11
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX10
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX11
---
name: fract_f64_neg
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
index 5171403..7714c03 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -140,7 +140,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -345,7 +344,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
index 7b01f13..7b81669 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
@@ -143,7 +143,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -348,7 +347,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
index 002c03aa..e86f747 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
@@ -551,7 +551,9 @@ define amdgpu_kernel void @workgroup_one_as_release() #0 {
;
; GFX10CU-LABEL: name: workgroup_one_as_release
; GFX10CU: bb.0.entry:
+ ; GFX10CU-NEXT: S_WAITCNT_soft 16240
; GFX10CU-NEXT: S_WAITCNT_lds_direct
+ ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_one_as_release
@@ -562,6 +564,8 @@ define amdgpu_kernel void @workgroup_one_as_release() #0 {
;
; GFX11CU-LABEL: name: workgroup_one_as_release
; GFX11CU: bb.0.entry:
+ ; GFX11CU-NEXT: S_WAITCNT_soft 1015
+ ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("workgroup-one-as") release
@@ -587,7 +591,9 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 {
;
; GFX10CU-LABEL: name: workgroup_one_as_acq_rel
; GFX10CU: bb.0.entry:
+ ; GFX10CU-NEXT: S_WAITCNT_soft 16240
; GFX10CU-NEXT: S_WAITCNT_lds_direct
+ ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_one_as_acq_rel
@@ -599,6 +605,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 {
;
; GFX11CU-LABEL: name: workgroup_one_as_acq_rel
; GFX11CU: bb.0.entry:
+ ; GFX11CU-NEXT: S_WAITCNT_soft 1015
+ ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("workgroup-one-as") acq_rel
@@ -624,7 +632,9 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 {
;
; GFX10CU-LABEL: name: workgroup_one_as_seq_cst
; GFX10CU: bb.0.entry:
+ ; GFX10CU-NEXT: S_WAITCNT_soft 16240
; GFX10CU-NEXT: S_WAITCNT_lds_direct
+ ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_one_as_seq_cst
@@ -636,6 +646,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 {
;
; GFX11CU-LABEL: name: workgroup_one_as_seq_cst
; GFX11CU: bb.0.entry:
+ ; GFX11CU-NEXT: S_WAITCNT_soft 1015
+ ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("workgroup-one-as") seq_cst
@@ -1305,8 +1317,9 @@ define amdgpu_kernel void @workgroup_release() #0 {
;
; GFX10CU-LABEL: name: workgroup_release
; GFX10CU: bb.0.entry:
- ; GFX10CU-NEXT: S_WAITCNT_soft 49279
+ ; GFX10CU-NEXT: S_WAITCNT_soft 112
; GFX10CU-NEXT: S_WAITCNT_lds_direct
+ ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_release
@@ -1317,7 +1330,8 @@ define amdgpu_kernel void @workgroup_release() #0 {
;
; GFX11CU-LABEL: name: workgroup_release
; GFX11CU: bb.0.entry:
- ; GFX11CU-NEXT: S_WAITCNT_soft 64519
+ ; GFX11CU-NEXT: S_WAITCNT_soft 7
+ ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("workgroup") release
@@ -1345,8 +1359,9 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 {
;
; GFX10CU-LABEL: name: workgroup_acq_rel
; GFX10CU: bb.0.entry:
- ; GFX10CU-NEXT: S_WAITCNT_soft 49279
+ ; GFX10CU-NEXT: S_WAITCNT_soft 112
; GFX10CU-NEXT: S_WAITCNT_lds_direct
+ ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_acq_rel
@@ -1358,7 +1373,8 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 {
;
; GFX11CU-LABEL: name: workgroup_acq_rel
; GFX11CU: bb.0.entry:
- ; GFX11CU-NEXT: S_WAITCNT_soft 64519
+ ; GFX11CU-NEXT: S_WAITCNT_soft 7
+ ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("workgroup") acq_rel
@@ -1386,8 +1402,9 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 {
;
; GFX10CU-LABEL: name: workgroup_seq_cst
; GFX10CU: bb.0.entry:
- ; GFX10CU-NEXT: S_WAITCNT_soft 49279
+ ; GFX10CU-NEXT: S_WAITCNT_soft 112
; GFX10CU-NEXT: S_WAITCNT_lds_direct
+ ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_seq_cst
@@ -1399,7 +1416,8 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 {
;
; GFX11CU-LABEL: name: workgroup_seq_cst
; GFX11CU: bb.0.entry:
- ; GFX11CU-NEXT: S_WAITCNT_soft 64519
+ ; GFX11CU-NEXT: S_WAITCNT_soft 7
+ ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("workgroup") seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/add-max.ll b/llvm/test/CodeGen/AMDGPU/add-max.ll
index 00c6656..b3a7057 100644
--- a/llvm/test/CodeGen/AMDGPU/add-max.ll
+++ b/llvm/test/CodeGen/AMDGPU/add-max.ll
@@ -5,7 +5,7 @@
define amdgpu_ps float @add_max_u32_vvv(i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: add_max_u32_vvv:
; GCN: ; %bb.0:
-; GCN-NEXT: v_add_max_u32_e64 v0, v0, v1, v2
+; GCN-NEXT: v_add_max_u32 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
%add = add i32 %a, %b
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
@@ -16,7 +16,7 @@ define amdgpu_ps float @add_max_u32_vvv(i32 %a, i32 %b, i32 %c) {
define amdgpu_ps float @add_max_u32_svv(i32 inreg %a, i32 %b, i32 %c) {
; GCN-LABEL: add_max_u32_svv:
; GCN: ; %bb.0:
-; GCN-NEXT: v_add_max_u32_e64 v0, s0, v0, v1
+; GCN-NEXT: v_add_max_u32 v0, s0, v0, v1
; GCN-NEXT: ; return to shader part epilog
%add = add i32 %a, %b
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
@@ -27,7 +27,7 @@ define amdgpu_ps float @add_max_u32_svv(i32 inreg %a, i32 %b, i32 %c) {
define amdgpu_ps float @add_max_u32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) {
; SDAG-LABEL: add_max_u32_ssv:
; SDAG: ; %bb.0:
-; SDAG-NEXT: v_add_max_u32_e64 v0, s0, s1, v0
+; SDAG-NEXT: v_add_max_u32 v0, s0, s1, v0
; SDAG-NEXT: ; return to shader part epilog
;
; GISEL-LABEL: add_max_u32_ssv:
@@ -59,7 +59,7 @@ define amdgpu_ps float @add_max_u32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c
define amdgpu_ps float @add_max_u32_vsi(i32 %a, i32 inreg %b) {
; GCN-LABEL: add_max_u32_vsi:
; GCN: ; %bb.0:
-; GCN-NEXT: v_add_max_u32_e64 v0, v0, s0, 4
+; GCN-NEXT: v_add_max_u32 v0, v0, s0, 4
; GCN-NEXT: ; return to shader part epilog
%add = add i32 %a, %b
%max = call i32 @llvm.umax.i32(i32 %add, i32 4)
@@ -70,7 +70,7 @@ define amdgpu_ps float @add_max_u32_vsi(i32 %a, i32 inreg %b) {
define amdgpu_ps float @add_max_u32_svl(i32 inreg %a, i32 %b) {
; GCN-LABEL: add_max_u32_svl:
; GCN: ; %bb.0:
-; GCN-NEXT: v_add_max_u32_e64 v0, s0, v0, 0x64
+; GCN-NEXT: v_add_max_u32 v0, s0, v0, 0x64
; GCN-NEXT: ; return to shader part epilog
%add = add i32 %a, %b
%max = call i32 @llvm.umax.i32(i32 %add, i32 100)
@@ -81,7 +81,7 @@ define amdgpu_ps float @add_max_u32_svl(i32 inreg %a, i32 %b) {
define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b) {
; SDAG-LABEL: add_max_u32_slv:
; SDAG: ; %bb.0:
-; SDAG-NEXT: v_add_max_u32_e64 v0, 0x64, s0, v0
+; SDAG-NEXT: v_add_max_u32 v0, 0x64, s0, v0
; SDAG-NEXT: ; return to shader part epilog
;
; GISEL-LABEL: add_max_u32_slv:
@@ -99,7 +99,7 @@ define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b) {
define amdgpu_ps float @add_max_i32_vvv(i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: add_max_i32_vvv:
; GCN: ; %bb.0:
-; GCN-NEXT: v_add_max_i32_e64 v0, v0, v1, v2
+; GCN-NEXT: v_add_max_i32 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
%add = add i32 %a, %b
%max = call i32 @llvm.smax.i32(i32 %add, i32 %c)
@@ -110,7 +110,7 @@ define amdgpu_ps float @add_max_i32_vvv(i32 %a, i32 %b, i32 %c) {
define amdgpu_ps float @add_min_u32_vvv(i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: add_min_u32_vvv:
; GCN: ; %bb.0:
-; GCN-NEXT: v_add_min_u32_e64 v0, v0, v1, v2
+; GCN-NEXT: v_add_min_u32 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
%add = add i32 %a, %b
%max = call i32 @llvm.umin.i32(i32 %add, i32 %c)
@@ -121,7 +121,7 @@ define amdgpu_ps float @add_min_u32_vvv(i32 %a, i32 %b, i32 %c) {
define amdgpu_ps float @add_min_i32_vvv(i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: add_min_i32_vvv:
; GCN: ; %bb.0:
-; GCN-NEXT: v_add_min_i32_e64 v0, v0, v1, v2
+; GCN-NEXT: v_add_min_i32 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
%add = add i32 %a, %b
%max = call i32 @llvm.smin.i32(i32 %add, i32 %c)
diff --git a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
index b72eba8..8088c1b 100644
--- a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
+++ b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
@@ -180,11 +180,7 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B
; CHECK-LABEL: s_add64_32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, s2
-; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
; CHECK-NEXT: s_addc_u32 s1, s1, s3
-; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_addc_u32 s2, s4, 0
; CHECK-NEXT: ; return to shader part epilog
%sum64 = add i64 %val64A, %val64B
@@ -199,14 +195,10 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B
define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_uadd_v2i64:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_add_u32 s10, s2, s6
-; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0
-; CHECK-NEXT: s_addc_u32 s8, s3, s7
+; CHECK-NEXT: s_add_u32 s6, s2, s6
+; CHECK-NEXT: s_addc_u32 s7, s3, s7
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT: s_add_u32 s0, s0, s4
-; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
; CHECK-NEXT: s_addc_u32 s1, s1, s5
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -215,8 +207,8 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v7
; CHECK-NEXT: v_readfirstlane_b32 s2, v6
-; CHECK-NEXT: v_mov_b32_e32 v4, s10
-; CHECK-NEXT: v_mov_b32_e32 v5, s8
+; CHECK-NEXT: v_mov_b32_e32 v4, s6
+; CHECK-NEXT: v_mov_b32_e32 v5, s7
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_mov_b32 s3, s2
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
@@ -233,14 +225,10 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_v2i64:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_sub_u32 s10, s2, s6
-; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0
-; CHECK-NEXT: s_subb_u32 s8, s3, s7
+; CHECK-NEXT: s_sub_u32 s6, s2, s6
+; CHECK-NEXT: s_subb_u32 s7, s3, s7
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT: s_sub_u32 s0, s0, s4
-; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
; CHECK-NEXT: s_subb_u32 s1, s1, s5
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -249,8 +237,8 @@ define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v7
; CHECK-NEXT: v_readfirstlane_b32 s2, v6
-; CHECK-NEXT: v_mov_b32_e32 v4, s10
-; CHECK-NEXT: v_mov_b32_e32 v5, s8
+; CHECK-NEXT: v_mov_b32_e32 v4, s6
+; CHECK-NEXT: v_mov_b32_e32 v5, s7
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_mov_b32 s3, s2
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
@@ -268,8 +256,6 @@ define amdgpu_ps i64 @s_uadd_i64(i64 inreg %val0, i64 inreg %val1, ptr %ptrval)
; CHECK-LABEL: s_uadd_i64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, s2
-; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0
; CHECK-NEXT: s_addc_u32 s1, s1, s3
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -292,8 +278,6 @@ define amdgpu_ps i64 @s_uadd_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_uadd_p1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, 1
-; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -339,8 +323,6 @@ define amdgpu_ps i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_p1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_sub_u32 s0, s0, 1
-; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_subb_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -363,8 +345,6 @@ define amdgpu_ps i64 @s_usub_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_n1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_sub_u32 s0, s0, -1
-; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_subb_u32 s1, s1, -1
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 948811e..51df8c3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -7821,10 +7821,9 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_addc_u32 s15, 0, s16
; GFX6-NEXT: s_add_u32 s16, s0, s1
; GFX6-NEXT: v_mov_b32_e32 v0, s16
-; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s14, s14, s15
; GFX6-NEXT: s_mul_i32 s0, s12, s14
; GFX6-NEXT: v_readfirstlane_b32 s1, v0
@@ -7855,7 +7854,6 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_add_u32 s15, s16, s0
; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s14, s14, s12
; GFX6-NEXT: s_ashr_i32 s12, s7, 31
; GFX6-NEXT: s_add_u32 s0, s6, s12
@@ -7881,52 +7879,50 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: v_readfirstlane_b32 s4, v0
; GFX6-NEXT: s_addc_u32 s4, s4, 0
; GFX6-NEXT: s_mul_i32 s14, s7, s14
-; GFX6-NEXT: s_add_u32 s14, s1, s14
-; GFX6-NEXT: v_mov_b32_e32 v0, s14
+; GFX6-NEXT: s_add_u32 s16, s1, s14
+; GFX6-NEXT: v_mov_b32_e32 v0, s16
; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0
-; GFX6-NEXT: s_addc_u32 s15, 0, s4
+; GFX6-NEXT: s_addc_u32 s17, 0, s4
; GFX6-NEXT: s_mov_b32 s1, s5
-; GFX6-NEXT: s_mul_i32 s4, s10, s15
+; GFX6-NEXT: s_mul_i32 s4, s10, s17
; GFX6-NEXT: v_readfirstlane_b32 s5, v0
; GFX6-NEXT: s_add_i32 s4, s5, s4
-; GFX6-NEXT: s_mul_i32 s5, s11, s14
-; GFX6-NEXT: s_add_i32 s16, s4, s5
-; GFX6-NEXT: s_sub_i32 s17, s7, s16
-; GFX6-NEXT: s_mul_i32 s4, s10, s14
+; GFX6-NEXT: s_mul_i32 s5, s11, s16
+; GFX6-NEXT: s_add_i32 s18, s4, s5
+; GFX6-NEXT: s_sub_i32 s14, s7, s18
+; GFX6-NEXT: s_mul_i32 s4, s10, s16
; GFX6-NEXT: s_sub_u32 s6, s6, s4
; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX6-NEXT: s_or_b32 s18, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s18, 0
-; GFX6-NEXT: s_subb_u32 s17, s17, s11
-; GFX6-NEXT: s_sub_u32 s19, s6, s10
-; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX6-NEXT: s_or_b32 s15, s4, s5
+; GFX6-NEXT: s_subb_u32 s19, s14, s11
+; GFX6-NEXT: s_sub_u32 s20, s6, s10
+; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT: s_or_b32 s14, s14, s15
+; GFX6-NEXT: s_subb_u32 s14, s19, 0
+; GFX6-NEXT: s_cmp_ge_u32 s14, s11
+; GFX6-NEXT: s_cselect_b32 s15, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s20, s10
+; GFX6-NEXT: s_cselect_b32 s19, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s14, s11
+; GFX6-NEXT: s_cselect_b32 s14, s19, s15
+; GFX6-NEXT: s_add_u32 s15, s16, 1
+; GFX6-NEXT: s_addc_u32 s19, s17, 0
+; GFX6-NEXT: s_add_u32 s20, s16, 2
+; GFX6-NEXT: s_addc_u32 s21, s17, 0
+; GFX6-NEXT: s_cmp_lg_u32 s14, 0
+; GFX6-NEXT: s_cselect_b32 s14, s20, s15
+; GFX6-NEXT: s_cselect_b32 s15, s21, s19
; GFX6-NEXT: s_or_b32 s4, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_subb_u32 s4, s17, 0
+; GFX6-NEXT: s_subb_u32 s4, s7, s18
; GFX6-NEXT: s_cmp_ge_u32 s4, s11
; GFX6-NEXT: s_cselect_b32 s5, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s19, s10
-; GFX6-NEXT: s_cselect_b32 s17, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s4, s11
-; GFX6-NEXT: s_cselect_b32 s4, s17, s5
-; GFX6-NEXT: s_add_u32 s5, s14, 1
-; GFX6-NEXT: s_addc_u32 s17, s15, 0
-; GFX6-NEXT: s_add_u32 s19, s14, 2
-; GFX6-NEXT: s_addc_u32 s20, s15, 0
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_cselect_b32 s4, s19, s5
-; GFX6-NEXT: s_cselect_b32 s5, s20, s17
-; GFX6-NEXT: s_cmp_lg_u32 s18, 0
-; GFX6-NEXT: s_subb_u32 s7, s7, s16
-; GFX6-NEXT: s_cmp_ge_u32 s7, s11
-; GFX6-NEXT: s_cselect_b32 s16, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s6, s10
; GFX6-NEXT: s_cselect_b32 s6, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s7, s11
-; GFX6-NEXT: s_cselect_b32 s6, s6, s16
-; GFX6-NEXT: s_cmp_lg_u32 s6, 0
-; GFX6-NEXT: s_cselect_b32 s5, s5, s15
-; GFX6-NEXT: s_cselect_b32 s4, s4, s14
+; GFX6-NEXT: s_cmp_eq_u32 s4, s11
+; GFX6-NEXT: s_cselect_b32 s4, s6, s5
+; GFX6-NEXT: s_cmp_lg_u32 s4, 0
+; GFX6-NEXT: s_cselect_b32 s5, s15, s17
+; GFX6-NEXT: s_cselect_b32 s4, s14, s16
; GFX6-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9]
; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT: s_sub_u32 s4, s4, s6
@@ -7949,8 +7945,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s10, 0, s8
-; GFX9-NEXT: s_subb_u32 s11, 0, s9
+; GFX9-NEXT: s_sub_u32 s4, 0, s8
+; GFX9-NEXT: s_subb_u32 s5, 0, s9
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -7960,56 +7956,52 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s12, v2
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_mul_i32 s5, s10, s12
-; GFX9-NEXT: s_mul_hi_u32 s14, s10, s4
-; GFX9-NEXT: s_mul_i32 s13, s11, s4
-; GFX9-NEXT: s_add_i32 s5, s14, s5
-; GFX9-NEXT: s_mul_i32 s15, s10, s4
-; GFX9-NEXT: s_add_i32 s5, s5, s13
-; GFX9-NEXT: s_mul_hi_u32 s14, s4, s15
-; GFX9-NEXT: s_mul_i32 s16, s4, s5
-; GFX9-NEXT: s_mul_hi_u32 s13, s4, s5
+; GFX9-NEXT: v_readfirstlane_b32 s10, v2
+; GFX9-NEXT: v_readfirstlane_b32 s11, v1
+; GFX9-NEXT: s_mul_i32 s12, s4, s10
+; GFX9-NEXT: s_mul_hi_u32 s14, s4, s11
+; GFX9-NEXT: s_mul_i32 s13, s5, s11
+; GFX9-NEXT: s_add_i32 s12, s14, s12
+; GFX9-NEXT: s_mul_i32 s15, s4, s11
+; GFX9-NEXT: s_add_i32 s12, s12, s13
+; GFX9-NEXT: s_mul_hi_u32 s14, s11, s15
+; GFX9-NEXT: s_mul_i32 s16, s11, s12
+; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12
; GFX9-NEXT: s_add_u32 s14, s14, s16
; GFX9-NEXT: s_addc_u32 s13, 0, s13
-; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15
-; GFX9-NEXT: s_mul_i32 s15, s12, s15
+; GFX9-NEXT: s_mul_hi_u32 s17, s10, s15
+; GFX9-NEXT: s_mul_i32 s15, s10, s15
; GFX9-NEXT: s_add_u32 s14, s14, s15
-; GFX9-NEXT: s_mul_hi_u32 s16, s12, s5
+; GFX9-NEXT: s_mul_hi_u32 s16, s10, s12
; GFX9-NEXT: s_addc_u32 s13, s13, s17
; GFX9-NEXT: s_addc_u32 s14, s16, 0
-; GFX9-NEXT: s_mul_i32 s5, s12, s5
-; GFX9-NEXT: s_add_u32 s5, s13, s5
+; GFX9-NEXT: s_mul_i32 s12, s10, s12
+; GFX9-NEXT: s_add_u32 s12, s13, s12
; GFX9-NEXT: s_addc_u32 s13, 0, s14
-; GFX9-NEXT: s_add_u32 s14, s4, s5
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s12, s12, s13
-; GFX9-NEXT: s_mul_i32 s4, s10, s12
-; GFX9-NEXT: s_mul_hi_u32 s5, s10, s14
-; GFX9-NEXT: s_add_i32 s4, s5, s4
-; GFX9-NEXT: s_mul_i32 s11, s11, s14
-; GFX9-NEXT: s_add_i32 s4, s4, s11
-; GFX9-NEXT: s_mul_i32 s10, s10, s14
-; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10
-; GFX9-NEXT: s_mul_i32 s13, s12, s10
-; GFX9-NEXT: s_mul_i32 s16, s14, s4
-; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10
-; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4
-; GFX9-NEXT: s_add_u32 s10, s10, s16
+; GFX9-NEXT: s_add_u32 s11, s11, s12
+; GFX9-NEXT: s_addc_u32 s10, s10, s13
+; GFX9-NEXT: s_mul_i32 s12, s4, s10
+; GFX9-NEXT: s_mul_hi_u32 s13, s4, s11
+; GFX9-NEXT: s_add_i32 s12, s13, s12
+; GFX9-NEXT: s_mul_i32 s5, s5, s11
+; GFX9-NEXT: s_add_i32 s12, s12, s5
+; GFX9-NEXT: s_mul_i32 s4, s4, s11
+; GFX9-NEXT: s_mul_hi_u32 s13, s10, s4
+; GFX9-NEXT: s_mul_i32 s14, s10, s4
+; GFX9-NEXT: s_mul_i32 s16, s11, s12
+; GFX9-NEXT: s_mul_hi_u32 s4, s11, s4
+; GFX9-NEXT: s_mul_hi_u32 s15, s11, s12
+; GFX9-NEXT: s_add_u32 s4, s4, s16
; GFX9-NEXT: s_addc_u32 s15, 0, s15
-; GFX9-NEXT: s_add_u32 s10, s10, s13
-; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4
-; GFX9-NEXT: s_addc_u32 s10, s15, s11
+; GFX9-NEXT: s_add_u32 s4, s4, s14
+; GFX9-NEXT: s_mul_hi_u32 s5, s10, s12
+; GFX9-NEXT: s_addc_u32 s4, s15, s13
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s4, s12, s4
-; GFX9-NEXT: s_add_u32 s4, s10, s4
-; GFX9-NEXT: s_addc_u32 s10, 0, s5
-; GFX9-NEXT: s_add_u32 s11, s14, s4
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s10, s12, s10
+; GFX9-NEXT: s_mul_i32 s12, s10, s12
+; GFX9-NEXT: s_add_u32 s4, s4, s12
+; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_add_u32 s11, s11, s4
+; GFX9-NEXT: s_addc_u32 s10, s10, s5
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: s_add_u32 s2, s2, s4
@@ -8028,38 +8020,35 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_addc_u32 s11, s12, s15
; GFX9-NEXT: s_addc_u32 s12, s14, 0
; GFX9-NEXT: s_mul_i32 s10, s3, s10
-; GFX9-NEXT: s_add_u32 s14, s11, s10
-; GFX9-NEXT: s_addc_u32 s15, 0, s12
-; GFX9-NEXT: s_mul_i32 s10, s8, s15
-; GFX9-NEXT: s_mul_hi_u32 s11, s8, s14
+; GFX9-NEXT: s_add_u32 s13, s11, s10
+; GFX9-NEXT: s_addc_u32 s12, 0, s12
+; GFX9-NEXT: s_mul_i32 s10, s8, s12
+; GFX9-NEXT: s_mul_hi_u32 s11, s8, s13
; GFX9-NEXT: s_add_i32 s10, s11, s10
-; GFX9-NEXT: s_mul_i32 s11, s9, s14
-; GFX9-NEXT: s_add_i32 s16, s10, s11
-; GFX9-NEXT: s_sub_i32 s12, s3, s16
-; GFX9-NEXT: s_mul_i32 s10, s8, s14
+; GFX9-NEXT: s_mul_i32 s11, s9, s13
+; GFX9-NEXT: s_add_i32 s14, s10, s11
+; GFX9-NEXT: s_sub_i32 s15, s3, s14
+; GFX9-NEXT: s_mul_i32 s10, s8, s13
; GFX9-NEXT: s_sub_u32 s2, s2, s10
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s17, s12, s9
-; GFX9-NEXT: s_sub_u32 s18, s2, s8
-; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GFX9-NEXT: s_subb_u32 s12, s17, 0
-; GFX9-NEXT: s_cmp_ge_u32 s12, s9
-; GFX9-NEXT: s_cselect_b32 s13, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s18, s8
+; GFX9-NEXT: s_subb_u32 s15, s15, s9
+; GFX9-NEXT: s_sub_u32 s16, s2, s8
+; GFX9-NEXT: s_subb_u32 s15, s15, 0
+; GFX9-NEXT: s_cmp_ge_u32 s15, s9
; GFX9-NEXT: s_cselect_b32 s17, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s12, s9
-; GFX9-NEXT: s_cselect_b32 s12, s17, s13
-; GFX9-NEXT: s_add_u32 s13, s14, 1
-; GFX9-NEXT: s_addc_u32 s17, s15, 0
-; GFX9-NEXT: s_add_u32 s18, s14, 2
-; GFX9-NEXT: s_addc_u32 s19, s15, 0
-; GFX9-NEXT: s_cmp_lg_u32 s12, 0
-; GFX9-NEXT: s_cselect_b32 s12, s18, s13
-; GFX9-NEXT: s_cselect_b32 s13, s19, s17
+; GFX9-NEXT: s_cmp_ge_u32 s16, s8
+; GFX9-NEXT: s_cselect_b32 s16, -1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s15, s9
+; GFX9-NEXT: s_cselect_b32 s15, s16, s17
+; GFX9-NEXT: s_add_u32 s16, s13, 1
+; GFX9-NEXT: s_addc_u32 s17, s12, 0
+; GFX9-NEXT: s_add_u32 s18, s13, 2
+; GFX9-NEXT: s_addc_u32 s19, s12, 0
+; GFX9-NEXT: s_cmp_lg_u32 s15, 0
+; GFX9-NEXT: s_cselect_b32 s15, s18, s16
+; GFX9-NEXT: s_cselect_b32 s16, s19, s17
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s3, s3, s16
+; GFX9-NEXT: s_subb_u32 s3, s3, s14
; GFX9-NEXT: s_cmp_ge_u32 s3, s9
; GFX9-NEXT: s_cselect_b32 s10, -1, 0
; GFX9-NEXT: s_cmp_ge_u32 s2, s8
@@ -8067,8 +8056,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_cmp_eq_u32 s3, s9
; GFX9-NEXT: s_cselect_b32 s2, s2, s10
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
-; GFX9-NEXT: s_cselect_b32 s3, s13, s15
-; GFX9-NEXT: s_cselect_b32 s2, s12, s14
+; GFX9-NEXT: s_cselect_b32 s3, s16, s12
+; GFX9-NEXT: s_cselect_b32 s2, s15, s13
; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: s_sub_u32 s2, s2, s4
@@ -8328,10 +8317,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_addc_u32 s17, 0, s18
; GFX6-NEXT: s_add_u32 s18, s12, s13
; GFX6-NEXT: v_mov_b32_e32 v0, s18
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
; GFX6-NEXT: s_addc_u32 s16, s16, s17
; GFX6-NEXT: s_mul_i32 s12, s14, s16
; GFX6-NEXT: v_readfirstlane_b32 s13, v0
@@ -8362,7 +8350,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_add_u32 s15, s18, s12
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
; GFX6-NEXT: s_addc_u32 s14, s16, s14
; GFX6-NEXT: s_ashr_i32 s12, s9, 31
; GFX6-NEXT: s_add_u32 s8, s8, s12
@@ -8387,55 +8374,53 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_readfirstlane_b32 s16, v0
; GFX6-NEXT: s_addc_u32 s16, s16, 0
; GFX6-NEXT: s_mul_i32 s14, s9, s14
-; GFX6-NEXT: s_add_u32 s17, s15, s14
-; GFX6-NEXT: v_mov_b32_e32 v0, s17
+; GFX6-NEXT: s_add_u32 s18, s15, s14
+; GFX6-NEXT: v_mov_b32_e32 v0, s18
; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
-; GFX6-NEXT: s_addc_u32 s16, 0, s16
-; GFX6-NEXT: s_mul_i32 s14, s6, s16
+; GFX6-NEXT: s_addc_u32 s19, 0, s16
+; GFX6-NEXT: s_mul_i32 s14, s6, s19
; GFX6-NEXT: v_readfirstlane_b32 s15, v0
; GFX6-NEXT: s_add_i32 s14, s15, s14
-; GFX6-NEXT: s_mul_i32 s15, s7, s17
-; GFX6-NEXT: s_add_i32 s18, s14, s15
-; GFX6-NEXT: s_sub_i32 s19, s9, s18
-; GFX6-NEXT: s_mul_i32 s14, s6, s17
+; GFX6-NEXT: s_mul_i32 s15, s7, s18
+; GFX6-NEXT: s_add_i32 s20, s14, s15
+; GFX6-NEXT: s_sub_i32 s16, s9, s20
+; GFX6-NEXT: s_mul_i32 s14, s6, s18
; GFX6-NEXT: s_sub_u32 s8, s8, s14
; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT: s_or_b32 s20, s14, s15
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_subb_u32 s19, s19, s7
-; GFX6-NEXT: s_sub_u32 s21, s8, s6
-; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT: s_or_b32 s17, s14, s15
+; GFX6-NEXT: s_subb_u32 s21, s16, s7
+; GFX6-NEXT: s_sub_u32 s22, s8, s6
+; GFX6-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GFX6-NEXT: s_or_b32 s16, s16, s17
+; GFX6-NEXT: s_subb_u32 s16, s21, 0
+; GFX6-NEXT: s_cmp_ge_u32 s16, s7
+; GFX6-NEXT: s_cselect_b32 s17, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s22, s6
+; GFX6-NEXT: s_cselect_b32 s21, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s16, s7
+; GFX6-NEXT: s_cselect_b32 s16, s21, s17
+; GFX6-NEXT: s_add_u32 s17, s18, 1
+; GFX6-NEXT: s_addc_u32 s21, s19, 0
+; GFX6-NEXT: s_add_u32 s22, s18, 2
+; GFX6-NEXT: s_addc_u32 s23, s19, 0
+; GFX6-NEXT: s_cmp_lg_u32 s16, 0
+; GFX6-NEXT: s_cselect_b32 s16, s22, s17
+; GFX6-NEXT: s_cselect_b32 s17, s23, s21
; GFX6-NEXT: s_or_b32 s14, s14, s15
-; GFX6-NEXT: s_cmp_lg_u32 s14, 0
-; GFX6-NEXT: s_subb_u32 s14, s19, 0
-; GFX6-NEXT: s_cmp_ge_u32 s14, s7
-; GFX6-NEXT: s_cselect_b32 s15, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s21, s6
-; GFX6-NEXT: s_cselect_b32 s19, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s14, s7
-; GFX6-NEXT: s_cselect_b32 s14, s19, s15
-; GFX6-NEXT: s_add_u32 s15, s17, 1
-; GFX6-NEXT: s_addc_u32 s19, s16, 0
-; GFX6-NEXT: s_add_u32 s21, s17, 2
-; GFX6-NEXT: s_addc_u32 s22, s16, 0
-; GFX6-NEXT: s_cmp_lg_u32 s14, 0
-; GFX6-NEXT: s_cselect_b32 s14, s21, s15
-; GFX6-NEXT: s_cselect_b32 s15, s22, s19
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_subb_u32 s9, s9, s18
+; GFX6-NEXT: s_subb_u32 s9, s9, s20
; GFX6-NEXT: s_cmp_ge_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s18, -1, 0
+; GFX6-NEXT: s_cselect_b32 s14, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s8, s6
; GFX6-NEXT: s_cselect_b32 s6, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s6, s6, s18
+; GFX6-NEXT: s_cselect_b32 s6, s6, s14
; GFX6-NEXT: s_cmp_lg_u32 s6, 0
-; GFX6-NEXT: s_cselect_b32 s7, s15, s16
-; GFX6-NEXT: s_cselect_b32 s6, s14, s17
+; GFX6-NEXT: s_cselect_b32 s7, s17, s19
+; GFX6-NEXT: s_cselect_b32 s6, s16, s18
; GFX6-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3]
; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
-; GFX6-NEXT: s_sub_u32 s14, s6, s2
-; GFX6-NEXT: s_subb_u32 s15, s7, s3
+; GFX6-NEXT: s_sub_u32 s16, s6, s2
+; GFX6-NEXT: s_subb_u32 s17, s7, s3
; GFX6-NEXT: s_ashr_i32 s6, s1, 31
; GFX6-NEXT: s_add_u32 s0, s0, s6
; GFX6-NEXT: s_mov_b32 s7, s6
@@ -8454,40 +8439,39 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0
-; GFX6-NEXT: v_readfirstlane_b32 s16, v1
+; GFX6-NEXT: v_readfirstlane_b32 s14, v1
; GFX6-NEXT: v_readfirstlane_b32 s2, v0
-; GFX6-NEXT: s_mul_i32 s1, s12, s16
+; GFX6-NEXT: s_mul_i32 s1, s12, s14
; GFX6-NEXT: v_readfirstlane_b32 s3, v2
; GFX6-NEXT: s_mul_i32 s0, s13, s2
; GFX6-NEXT: s_add_i32 s1, s3, s1
; GFX6-NEXT: s_add_i32 s3, s1, s0
-; GFX6-NEXT: s_mul_i32 s17, s12, s2
+; GFX6-NEXT: s_mul_i32 s15, s12, s2
; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3
-; GFX6-NEXT: v_mul_hi_u32 v0, v0, s17
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: s_mul_i32 s4, s2, s3
; GFX6-NEXT: v_readfirstlane_b32 s5, v2
; GFX6-NEXT: v_readfirstlane_b32 s18, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, v1, s17
+; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15
; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3
; GFX6-NEXT: s_add_u32 s4, s18, s4
; GFX6-NEXT: s_addc_u32 s5, 0, s5
-; GFX6-NEXT: s_mul_i32 s17, s16, s17
+; GFX6-NEXT: s_mul_i32 s15, s14, s15
; GFX6-NEXT: v_readfirstlane_b32 s18, v0
-; GFX6-NEXT: s_add_u32 s4, s4, s17
+; GFX6-NEXT: s_add_u32 s4, s4, s15
; GFX6-NEXT: s_addc_u32 s4, s5, s18
; GFX6-NEXT: v_readfirstlane_b32 s5, v1
; GFX6-NEXT: s_addc_u32 s5, s5, 0
-; GFX6-NEXT: s_mul_i32 s3, s16, s3
+; GFX6-NEXT: s_mul_i32 s3, s14, s3
; GFX6-NEXT: s_add_u32 s3, s4, s3
; GFX6-NEXT: s_addc_u32 s4, 0, s5
; GFX6-NEXT: s_add_u32 s5, s2, s3
; GFX6-NEXT: v_mov_b32_e32 v0, s5
-; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6-NEXT: s_addc_u32 s4, s16, s4
+; GFX6-NEXT: s_addc_u32 s4, s14, s4
; GFX6-NEXT: s_mul_i32 s2, s12, s4
; GFX6-NEXT: v_readfirstlane_b32 s3, v0
; GFX6-NEXT: s_add_i32 s2, s3, s2
@@ -8501,14 +8485,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX6-NEXT: s_mul_i32 s13, s5, s2
-; GFX6-NEXT: v_readfirstlane_b32 s17, v2
-; GFX6-NEXT: s_add_u32 s13, s17, s13
-; GFX6-NEXT: v_readfirstlane_b32 s16, v0
+; GFX6-NEXT: v_readfirstlane_b32 s15, v2
+; GFX6-NEXT: s_add_u32 s13, s15, s13
+; GFX6-NEXT: v_readfirstlane_b32 s14, v0
; GFX6-NEXT: s_mul_i32 s3, s4, s3
-; GFX6-NEXT: s_addc_u32 s16, 0, s16
+; GFX6-NEXT: s_addc_u32 s14, 0, s14
; GFX6-NEXT: v_readfirstlane_b32 s12, v3
; GFX6-NEXT: s_add_u32 s3, s13, s3
-; GFX6-NEXT: s_addc_u32 s3, s16, s12
+; GFX6-NEXT: s_addc_u32 s3, s14, s12
; GFX6-NEXT: v_readfirstlane_b32 s12, v1
; GFX6-NEXT: s_addc_u32 s12, s12, 0
; GFX6-NEXT: s_mul_i32 s2, s4, s2
@@ -8517,7 +8501,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_add_u32 s13, s5, s2
; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
; GFX6-NEXT: s_addc_u32 s12, s4, s12
; GFX6-NEXT: s_ashr_i32 s4, s11, 31
; GFX6-NEXT: s_add_u32 s2, s10, s4
@@ -8529,72 +8512,70 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mov_b32_e32 v2, s13
; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2
; GFX6-NEXT: s_mul_i32 s2, s10, s12
-; GFX6-NEXT: v_readfirstlane_b32 s16, v1
+; GFX6-NEXT: v_readfirstlane_b32 s14, v1
; GFX6-NEXT: v_mul_hi_u32 v1, s11, v2
-; GFX6-NEXT: v_readfirstlane_b32 s17, v3
+; GFX6-NEXT: v_readfirstlane_b32 s15, v3
; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0
-; GFX6-NEXT: s_add_u32 s2, s17, s2
-; GFX6-NEXT: s_addc_u32 s16, 0, s16
+; GFX6-NEXT: s_add_u32 s2, s15, s2
+; GFX6-NEXT: s_addc_u32 s14, 0, s14
; GFX6-NEXT: s_mul_i32 s13, s11, s13
-; GFX6-NEXT: v_readfirstlane_b32 s17, v1
+; GFX6-NEXT: v_readfirstlane_b32 s15, v1
; GFX6-NEXT: s_add_u32 s2, s2, s13
-; GFX6-NEXT: s_addc_u32 s2, s16, s17
+; GFX6-NEXT: s_addc_u32 s2, s14, s15
; GFX6-NEXT: v_readfirstlane_b32 s13, v0
; GFX6-NEXT: s_addc_u32 s13, s13, 0
; GFX6-NEXT: s_mul_i32 s12, s11, s12
-; GFX6-NEXT: s_add_u32 s16, s2, s12
-; GFX6-NEXT: v_mov_b32_e32 v0, s16
+; GFX6-NEXT: s_add_u32 s18, s2, s12
+; GFX6-NEXT: v_mov_b32_e32 v0, s18
; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
-; GFX6-NEXT: s_addc_u32 s17, 0, s13
-; GFX6-NEXT: s_mul_i32 s12, s8, s17
+; GFX6-NEXT: s_addc_u32 s19, 0, s13
+; GFX6-NEXT: s_mul_i32 s12, s8, s19
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: v_readfirstlane_b32 s13, v0
; GFX6-NEXT: s_add_i32 s12, s13, s12
-; GFX6-NEXT: s_mul_i32 s13, s9, s16
-; GFX6-NEXT: s_add_i32 s18, s12, s13
-; GFX6-NEXT: s_sub_i32 s19, s11, s18
-; GFX6-NEXT: s_mul_i32 s12, s8, s16
+; GFX6-NEXT: s_mul_i32 s13, s9, s18
+; GFX6-NEXT: s_add_i32 s20, s12, s13
+; GFX6-NEXT: s_sub_i32 s14, s11, s20
+; GFX6-NEXT: s_mul_i32 s12, s8, s18
; GFX6-NEXT: s_sub_u32 s10, s10, s12
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s20, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_subb_u32 s19, s19, s9
-; GFX6-NEXT: s_sub_u32 s21, s10, s8
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s15, s12, s13
+; GFX6-NEXT: s_subb_u32 s21, s14, s9
+; GFX6-NEXT: s_sub_u32 s22, s10, s8
+; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT: s_or_b32 s14, s14, s15
+; GFX6-NEXT: s_subb_u32 s14, s21, 0
+; GFX6-NEXT: s_cmp_ge_u32 s14, s9
+; GFX6-NEXT: s_cselect_b32 s15, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s22, s8
+; GFX6-NEXT: s_cselect_b32 s21, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s14, s9
+; GFX6-NEXT: s_cselect_b32 s14, s21, s15
+; GFX6-NEXT: s_add_u32 s15, s18, 1
+; GFX6-NEXT: s_addc_u32 s21, s19, 0
+; GFX6-NEXT: s_add_u32 s22, s18, 2
+; GFX6-NEXT: s_addc_u32 s23, s19, 0
+; GFX6-NEXT: s_cmp_lg_u32 s14, 0
+; GFX6-NEXT: s_cselect_b32 s14, s22, s15
+; GFX6-NEXT: s_cselect_b32 s15, s23, s21
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s12, s19, 0
-; GFX6-NEXT: s_cmp_ge_u32 s12, s9
-; GFX6-NEXT: s_cselect_b32 s13, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s21, s8
-; GFX6-NEXT: s_cselect_b32 s19, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s12, s9
-; GFX6-NEXT: s_cselect_b32 s12, s19, s13
-; GFX6-NEXT: s_add_u32 s13, s16, 1
-; GFX6-NEXT: s_addc_u32 s19, s17, 0
-; GFX6-NEXT: s_add_u32 s21, s16, 2
-; GFX6-NEXT: s_addc_u32 s22, s17, 0
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_cselect_b32 s12, s21, s13
-; GFX6-NEXT: s_cselect_b32 s13, s22, s19
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_subb_u32 s11, s11, s18
+; GFX6-NEXT: s_subb_u32 s11, s11, s20
; GFX6-NEXT: s_cmp_ge_u32 s11, s9
-; GFX6-NEXT: s_cselect_b32 s18, -1, 0
+; GFX6-NEXT: s_cselect_b32 s12, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s10, s8
; GFX6-NEXT: s_cselect_b32 s8, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s11, s9
-; GFX6-NEXT: s_cselect_b32 s8, s8, s18
+; GFX6-NEXT: s_cselect_b32 s8, s8, s12
; GFX6-NEXT: s_cmp_lg_u32 s8, 0
-; GFX6-NEXT: s_cselect_b32 s9, s13, s17
-; GFX6-NEXT: s_cselect_b32 s8, s12, s16
+; GFX6-NEXT: s_cselect_b32 s9, s15, s19
+; GFX6-NEXT: s_cselect_b32 s8, s14, s18
; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5]
; GFX6-NEXT: s_sub_u32 s4, s6, s4
; GFX6-NEXT: s_subb_u32 s5, s7, s5
; GFX6-NEXT: s_mov_b32 s2, -1
-; GFX6-NEXT: v_mov_b32_e32 v0, s14
-; GFX6-NEXT: v_mov_b32_e32 v1, s15
+; GFX6-NEXT: v_mov_b32_e32 v0, s16
+; GFX6-NEXT: v_mov_b32_e32 v1, s17
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: v_mov_b32_e32 v3, s5
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -8614,8 +8595,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT: s_sub_u32 s14, 0, s6
-; GFX9-NEXT: s_subb_u32 s15, 0, s7
+; GFX9-NEXT: s_sub_u32 s12, 0, s6
+; GFX9-NEXT: s_subb_u32 s13, 0, s7
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -8624,56 +8605,52 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s16, v1
-; GFX9-NEXT: v_readfirstlane_b32 s12, v0
-; GFX9-NEXT: s_mul_i32 s13, s14, s16
-; GFX9-NEXT: s_mul_hi_u32 s18, s14, s12
-; GFX9-NEXT: s_mul_i32 s17, s15, s12
-; GFX9-NEXT: s_add_i32 s13, s18, s13
-; GFX9-NEXT: s_mul_i32 s19, s14, s12
-; GFX9-NEXT: s_add_i32 s13, s13, s17
-; GFX9-NEXT: s_mul_hi_u32 s18, s12, s19
-; GFX9-NEXT: s_mul_i32 s20, s12, s13
-; GFX9-NEXT: s_mul_hi_u32 s17, s12, s13
+; GFX9-NEXT: v_readfirstlane_b32 s14, v1
+; GFX9-NEXT: v_readfirstlane_b32 s15, v0
+; GFX9-NEXT: s_mul_i32 s16, s12, s14
+; GFX9-NEXT: s_mul_hi_u32 s18, s12, s15
+; GFX9-NEXT: s_mul_i32 s17, s13, s15
+; GFX9-NEXT: s_add_i32 s16, s18, s16
+; GFX9-NEXT: s_mul_i32 s19, s12, s15
+; GFX9-NEXT: s_add_i32 s16, s16, s17
+; GFX9-NEXT: s_mul_hi_u32 s18, s15, s19
+; GFX9-NEXT: s_mul_i32 s20, s15, s16
+; GFX9-NEXT: s_mul_hi_u32 s17, s15, s16
; GFX9-NEXT: s_add_u32 s18, s18, s20
; GFX9-NEXT: s_addc_u32 s17, 0, s17
-; GFX9-NEXT: s_mul_hi_u32 s20, s16, s19
-; GFX9-NEXT: s_mul_i32 s19, s16, s19
+; GFX9-NEXT: s_mul_hi_u32 s20, s14, s19
+; GFX9-NEXT: s_mul_i32 s19, s14, s19
; GFX9-NEXT: s_add_u32 s18, s18, s19
-; GFX9-NEXT: s_mul_hi_u32 s21, s16, s13
+; GFX9-NEXT: s_mul_hi_u32 s21, s14, s16
; GFX9-NEXT: s_addc_u32 s17, s17, s20
; GFX9-NEXT: s_addc_u32 s18, s21, 0
-; GFX9-NEXT: s_mul_i32 s13, s16, s13
-; GFX9-NEXT: s_add_u32 s13, s17, s13
+; GFX9-NEXT: s_mul_i32 s16, s14, s16
+; GFX9-NEXT: s_add_u32 s16, s17, s16
; GFX9-NEXT: s_addc_u32 s17, 0, s18
-; GFX9-NEXT: s_add_u32 s18, s12, s13
-; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GFX9-NEXT: s_addc_u32 s16, s16, s17
-; GFX9-NEXT: s_mul_i32 s12, s14, s16
-; GFX9-NEXT: s_mul_hi_u32 s13, s14, s18
-; GFX9-NEXT: s_add_i32 s12, s13, s12
-; GFX9-NEXT: s_mul_i32 s15, s15, s18
-; GFX9-NEXT: s_add_i32 s12, s12, s15
-; GFX9-NEXT: s_mul_i32 s14, s14, s18
-; GFX9-NEXT: s_mul_hi_u32 s15, s16, s14
-; GFX9-NEXT: s_mul_i32 s17, s16, s14
-; GFX9-NEXT: s_mul_i32 s20, s18, s12
-; GFX9-NEXT: s_mul_hi_u32 s14, s18, s14
-; GFX9-NEXT: s_mul_hi_u32 s19, s18, s12
-; GFX9-NEXT: s_add_u32 s14, s14, s20
+; GFX9-NEXT: s_add_u32 s15, s15, s16
+; GFX9-NEXT: s_addc_u32 s14, s14, s17
+; GFX9-NEXT: s_mul_i32 s16, s12, s14
+; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15
+; GFX9-NEXT: s_add_i32 s16, s17, s16
+; GFX9-NEXT: s_mul_i32 s13, s13, s15
+; GFX9-NEXT: s_add_i32 s16, s16, s13
+; GFX9-NEXT: s_mul_i32 s12, s12, s15
+; GFX9-NEXT: s_mul_hi_u32 s17, s14, s12
+; GFX9-NEXT: s_mul_i32 s18, s14, s12
+; GFX9-NEXT: s_mul_i32 s20, s15, s16
+; GFX9-NEXT: s_mul_hi_u32 s12, s15, s12
+; GFX9-NEXT: s_mul_hi_u32 s19, s15, s16
+; GFX9-NEXT: s_add_u32 s12, s12, s20
; GFX9-NEXT: s_addc_u32 s19, 0, s19
-; GFX9-NEXT: s_add_u32 s14, s14, s17
-; GFX9-NEXT: s_mul_hi_u32 s13, s16, s12
-; GFX9-NEXT: s_addc_u32 s14, s19, s15
+; GFX9-NEXT: s_add_u32 s12, s12, s18
+; GFX9-NEXT: s_mul_hi_u32 s13, s14, s16
+; GFX9-NEXT: s_addc_u32 s12, s19, s17
; GFX9-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-NEXT: s_mul_i32 s12, s16, s12
-; GFX9-NEXT: s_add_u32 s12, s14, s12
-; GFX9-NEXT: s_addc_u32 s14, 0, s13
-; GFX9-NEXT: s_add_u32 s15, s18, s12
-; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GFX9-NEXT: s_addc_u32 s14, s16, s14
+; GFX9-NEXT: s_mul_i32 s16, s14, s16
+; GFX9-NEXT: s_add_u32 s12, s12, s16
+; GFX9-NEXT: s_addc_u32 s13, 0, s13
+; GFX9-NEXT: s_add_u32 s15, s15, s12
+; GFX9-NEXT: s_addc_u32 s14, s14, s13
; GFX9-NEXT: s_ashr_i32 s12, s9, 31
; GFX9-NEXT: s_add_u32 s8, s8, s12
; GFX9-NEXT: s_mov_b32 s13, s12
@@ -8691,38 +8668,35 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_addc_u32 s15, s16, s19
; GFX9-NEXT: s_addc_u32 s16, s18, 0
; GFX9-NEXT: s_mul_i32 s14, s9, s14
-; GFX9-NEXT: s_add_u32 s18, s15, s14
-; GFX9-NEXT: s_addc_u32 s19, 0, s16
-; GFX9-NEXT: s_mul_i32 s14, s6, s19
-; GFX9-NEXT: s_mul_hi_u32 s15, s6, s18
+; GFX9-NEXT: s_add_u32 s17, s15, s14
+; GFX9-NEXT: s_addc_u32 s16, 0, s16
+; GFX9-NEXT: s_mul_i32 s14, s6, s16
+; GFX9-NEXT: s_mul_hi_u32 s15, s6, s17
; GFX9-NEXT: s_add_i32 s14, s15, s14
-; GFX9-NEXT: s_mul_i32 s15, s7, s18
-; GFX9-NEXT: s_add_i32 s20, s14, s15
-; GFX9-NEXT: s_sub_i32 s16, s9, s20
-; GFX9-NEXT: s_mul_i32 s14, s6, s18
+; GFX9-NEXT: s_mul_i32 s15, s7, s17
+; GFX9-NEXT: s_add_i32 s18, s14, s15
+; GFX9-NEXT: s_sub_i32 s19, s9, s18
+; GFX9-NEXT: s_mul_i32 s14, s6, s17
; GFX9-NEXT: s_sub_u32 s8, s8, s14
; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT: s_subb_u32 s21, s16, s7
-; GFX9-NEXT: s_sub_u32 s22, s8, s6
-; GFX9-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[16:17], 0
-; GFX9-NEXT: s_subb_u32 s16, s21, 0
-; GFX9-NEXT: s_cmp_ge_u32 s16, s7
-; GFX9-NEXT: s_cselect_b32 s17, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s22, s6
+; GFX9-NEXT: s_subb_u32 s19, s19, s7
+; GFX9-NEXT: s_sub_u32 s20, s8, s6
+; GFX9-NEXT: s_subb_u32 s19, s19, 0
+; GFX9-NEXT: s_cmp_ge_u32 s19, s7
; GFX9-NEXT: s_cselect_b32 s21, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s16, s7
-; GFX9-NEXT: s_cselect_b32 s16, s21, s17
-; GFX9-NEXT: s_add_u32 s17, s18, 1
-; GFX9-NEXT: s_addc_u32 s21, s19, 0
-; GFX9-NEXT: s_add_u32 s22, s18, 2
-; GFX9-NEXT: s_addc_u32 s23, s19, 0
-; GFX9-NEXT: s_cmp_lg_u32 s16, 0
-; GFX9-NEXT: s_cselect_b32 s16, s22, s17
-; GFX9-NEXT: s_cselect_b32 s17, s23, s21
+; GFX9-NEXT: s_cmp_ge_u32 s20, s6
+; GFX9-NEXT: s_cselect_b32 s20, -1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s19, s7
+; GFX9-NEXT: s_cselect_b32 s19, s20, s21
+; GFX9-NEXT: s_add_u32 s20, s17, 1
+; GFX9-NEXT: s_addc_u32 s21, s16, 0
+; GFX9-NEXT: s_add_u32 s22, s17, 2
+; GFX9-NEXT: s_addc_u32 s23, s16, 0
+; GFX9-NEXT: s_cmp_lg_u32 s19, 0
+; GFX9-NEXT: s_cselect_b32 s19, s22, s20
+; GFX9-NEXT: s_cselect_b32 s20, s23, s21
; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT: s_subb_u32 s9, s9, s20
+; GFX9-NEXT: s_subb_u32 s9, s9, s18
; GFX9-NEXT: s_cmp_ge_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s14, -1, 0
; GFX9-NEXT: s_cmp_ge_u32 s8, s6
@@ -8730,12 +8704,12 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s6, s6, s14
; GFX9-NEXT: s_cmp_lg_u32 s6, 0
-; GFX9-NEXT: s_cselect_b32 s7, s17, s19
-; GFX9-NEXT: s_cselect_b32 s6, s16, s18
+; GFX9-NEXT: s_cselect_b32 s7, s20, s16
+; GFX9-NEXT: s_cselect_b32 s6, s19, s17
; GFX9-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3]
; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
-; GFX9-NEXT: s_sub_u32 s14, s6, s2
-; GFX9-NEXT: s_subb_u32 s15, s7, s3
+; GFX9-NEXT: s_sub_u32 s12, s6, s2
+; GFX9-NEXT: s_subb_u32 s13, s7, s3
; GFX9-NEXT: s_ashr_i32 s2, s1, 31
; GFX9-NEXT: s_add_u32 s0, s0, s2
; GFX9-NEXT: s_mov_b32 s3, s2
@@ -8744,8 +8718,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s8, 0, s6
-; GFX9-NEXT: s_subb_u32 s9, 0, s7
+; GFX9-NEXT: s_sub_u32 s4, 0, s6
+; GFX9-NEXT: s_subb_u32 s5, 0, s7
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -8755,105 +8729,98 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: v_readfirstlane_b32 s13, v2
-; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4
-; GFX9-NEXT: s_mul_i32 s16, s8, s13
-; GFX9-NEXT: s_mul_i32 s5, s9, s4
-; GFX9-NEXT: s_add_i32 s12, s12, s16
-; GFX9-NEXT: s_add_i32 s12, s12, s5
-; GFX9-NEXT: s_mul_i32 s17, s8, s4
-; GFX9-NEXT: s_mul_i32 s16, s4, s12
-; GFX9-NEXT: s_mul_hi_u32 s18, s4, s17
-; GFX9-NEXT: s_mul_hi_u32 s5, s4, s12
+; GFX9-NEXT: v_readfirstlane_b32 s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s15, v2
+; GFX9-NEXT: s_mul_hi_u32 s14, s4, s8
+; GFX9-NEXT: s_mul_i32 s16, s4, s15
+; GFX9-NEXT: s_mul_i32 s9, s5, s8
+; GFX9-NEXT: s_add_i32 s14, s14, s16
+; GFX9-NEXT: s_add_i32 s14, s14, s9
+; GFX9-NEXT: s_mul_i32 s17, s4, s8
+; GFX9-NEXT: s_mul_i32 s16, s8, s14
+; GFX9-NEXT: s_mul_hi_u32 s18, s8, s17
+; GFX9-NEXT: s_mul_hi_u32 s9, s8, s14
; GFX9-NEXT: s_add_u32 s16, s18, s16
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_mul_hi_u32 s19, s13, s17
-; GFX9-NEXT: s_mul_i32 s17, s13, s17
+; GFX9-NEXT: s_addc_u32 s9, 0, s9
+; GFX9-NEXT: s_mul_hi_u32 s19, s15, s17
+; GFX9-NEXT: s_mul_i32 s17, s15, s17
; GFX9-NEXT: s_add_u32 s16, s16, s17
-; GFX9-NEXT: s_mul_hi_u32 s18, s13, s12
-; GFX9-NEXT: s_addc_u32 s5, s5, s19
+; GFX9-NEXT: s_mul_hi_u32 s18, s15, s14
+; GFX9-NEXT: s_addc_u32 s9, s9, s19
; GFX9-NEXT: s_addc_u32 s16, s18, 0
-; GFX9-NEXT: s_mul_i32 s12, s13, s12
-; GFX9-NEXT: s_add_u32 s5, s5, s12
-; GFX9-NEXT: s_addc_u32 s12, 0, s16
-; GFX9-NEXT: s_add_u32 s16, s4, s5
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s12, s13, s12
-; GFX9-NEXT: s_mul_i32 s4, s8, s12
-; GFX9-NEXT: s_mul_hi_u32 s5, s8, s16
-; GFX9-NEXT: s_add_i32 s4, s5, s4
-; GFX9-NEXT: s_mul_i32 s9, s9, s16
-; GFX9-NEXT: s_add_i32 s4, s4, s9
-; GFX9-NEXT: s_mul_i32 s8, s8, s16
-; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8
-; GFX9-NEXT: s_mul_i32 s13, s12, s8
-; GFX9-NEXT: s_mul_i32 s18, s16, s4
-; GFX9-NEXT: s_mul_hi_u32 s8, s16, s8
-; GFX9-NEXT: s_mul_hi_u32 s17, s16, s4
-; GFX9-NEXT: s_add_u32 s8, s8, s18
+; GFX9-NEXT: s_mul_i32 s14, s15, s14
+; GFX9-NEXT: s_add_u32 s9, s9, s14
+; GFX9-NEXT: s_addc_u32 s14, 0, s16
+; GFX9-NEXT: s_add_u32 s8, s8, s9
+; GFX9-NEXT: s_addc_u32 s9, s15, s14
+; GFX9-NEXT: s_mul_i32 s14, s4, s9
+; GFX9-NEXT: s_mul_hi_u32 s15, s4, s8
+; GFX9-NEXT: s_add_i32 s14, s15, s14
+; GFX9-NEXT: s_mul_i32 s5, s5, s8
+; GFX9-NEXT: s_add_i32 s14, s14, s5
+; GFX9-NEXT: s_mul_i32 s4, s4, s8
+; GFX9-NEXT: s_mul_hi_u32 s15, s9, s4
+; GFX9-NEXT: s_mul_i32 s16, s9, s4
+; GFX9-NEXT: s_mul_i32 s18, s8, s14
+; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4
+; GFX9-NEXT: s_mul_hi_u32 s17, s8, s14
+; GFX9-NEXT: s_add_u32 s4, s4, s18
; GFX9-NEXT: s_addc_u32 s17, 0, s17
-; GFX9-NEXT: s_add_u32 s8, s8, s13
-; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4
-; GFX9-NEXT: s_addc_u32 s8, s17, s9
+; GFX9-NEXT: s_add_u32 s4, s4, s16
+; GFX9-NEXT: s_mul_hi_u32 s5, s9, s14
+; GFX9-NEXT: s_addc_u32 s4, s17, s15
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s4, s12, s4
-; GFX9-NEXT: s_add_u32 s4, s8, s4
-; GFX9-NEXT: s_addc_u32 s8, 0, s5
-; GFX9-NEXT: s_add_u32 s13, s16, s4
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s12, s12, s8
+; GFX9-NEXT: s_mul_i32 s14, s9, s14
+; GFX9-NEXT: s_add_u32 s4, s4, s14
+; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_add_u32 s14, s8, s4
+; GFX9-NEXT: s_addc_u32 s15, s9, s5
; GFX9-NEXT: s_ashr_i32 s4, s11, 31
; GFX9-NEXT: s_add_u32 s8, s10, s4
; GFX9-NEXT: s_mov_b32 s5, s4
; GFX9-NEXT: s_addc_u32 s9, s11, s4
; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5]
-; GFX9-NEXT: s_mul_i32 s11, s8, s12
-; GFX9-NEXT: s_mul_hi_u32 s16, s8, s13
-; GFX9-NEXT: s_mul_hi_u32 s10, s8, s12
+; GFX9-NEXT: s_mul_i32 s11, s8, s15
+; GFX9-NEXT: s_mul_hi_u32 s16, s8, s14
+; GFX9-NEXT: s_mul_hi_u32 s10, s8, s15
; GFX9-NEXT: s_add_u32 s11, s16, s11
; GFX9-NEXT: s_addc_u32 s10, 0, s10
-; GFX9-NEXT: s_mul_hi_u32 s17, s9, s13
-; GFX9-NEXT: s_mul_i32 s13, s9, s13
-; GFX9-NEXT: s_add_u32 s11, s11, s13
-; GFX9-NEXT: s_mul_hi_u32 s16, s9, s12
+; GFX9-NEXT: s_mul_hi_u32 s17, s9, s14
+; GFX9-NEXT: s_mul_i32 s14, s9, s14
+; GFX9-NEXT: s_add_u32 s11, s11, s14
+; GFX9-NEXT: s_mul_hi_u32 s16, s9, s15
; GFX9-NEXT: s_addc_u32 s10, s10, s17
; GFX9-NEXT: s_addc_u32 s11, s16, 0
-; GFX9-NEXT: s_mul_i32 s12, s9, s12
-; GFX9-NEXT: s_add_u32 s16, s10, s12
-; GFX9-NEXT: s_addc_u32 s17, 0, s11
-; GFX9-NEXT: s_mul_i32 s10, s6, s17
-; GFX9-NEXT: s_mul_hi_u32 s11, s6, s16
+; GFX9-NEXT: s_mul_i32 s14, s9, s15
+; GFX9-NEXT: s_add_u32 s14, s10, s14
+; GFX9-NEXT: s_addc_u32 s15, 0, s11
+; GFX9-NEXT: s_mul_i32 s10, s6, s15
+; GFX9-NEXT: s_mul_hi_u32 s11, s6, s14
; GFX9-NEXT: s_add_i32 s10, s11, s10
-; GFX9-NEXT: s_mul_i32 s11, s7, s16
-; GFX9-NEXT: s_add_i32 s18, s10, s11
-; GFX9-NEXT: s_sub_i32 s12, s9, s18
-; GFX9-NEXT: s_mul_i32 s10, s6, s16
+; GFX9-NEXT: s_mul_i32 s11, s7, s14
+; GFX9-NEXT: s_add_i32 s16, s10, s11
+; GFX9-NEXT: s_sub_i32 s17, s9, s16
+; GFX9-NEXT: s_mul_i32 s10, s6, s14
; GFX9-NEXT: s_sub_u32 s8, s8, s10
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s19, s12, s7
-; GFX9-NEXT: s_sub_u32 s20, s8, s6
-; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GFX9-NEXT: s_subb_u32 s12, s19, 0
-; GFX9-NEXT: s_cmp_ge_u32 s12, s7
-; GFX9-NEXT: s_cselect_b32 s13, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s20, s6
+; GFX9-NEXT: s_subb_u32 s17, s17, s7
+; GFX9-NEXT: s_sub_u32 s18, s8, s6
+; GFX9-NEXT: s_subb_u32 s17, s17, 0
+; GFX9-NEXT: s_cmp_ge_u32 s17, s7
; GFX9-NEXT: s_cselect_b32 s19, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s12, s7
-; GFX9-NEXT: s_cselect_b32 s12, s19, s13
-; GFX9-NEXT: s_add_u32 s13, s16, 1
-; GFX9-NEXT: s_addc_u32 s19, s17, 0
-; GFX9-NEXT: s_add_u32 s20, s16, 2
-; GFX9-NEXT: s_addc_u32 s21, s17, 0
-; GFX9-NEXT: s_cmp_lg_u32 s12, 0
-; GFX9-NEXT: s_cselect_b32 s12, s20, s13
-; GFX9-NEXT: s_cselect_b32 s13, s21, s19
+; GFX9-NEXT: s_cmp_ge_u32 s18, s6
+; GFX9-NEXT: s_cselect_b32 s18, -1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s17, s7
+; GFX9-NEXT: s_cselect_b32 s17, s18, s19
+; GFX9-NEXT: s_add_u32 s18, s14, 1
+; GFX9-NEXT: s_addc_u32 s19, s15, 0
+; GFX9-NEXT: s_add_u32 s20, s14, 2
+; GFX9-NEXT: s_addc_u32 s21, s15, 0
+; GFX9-NEXT: s_cmp_lg_u32 s17, 0
+; GFX9-NEXT: s_cselect_b32 s17, s20, s18
+; GFX9-NEXT: s_cselect_b32 s18, s21, s19
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s9, s9, s18
+; GFX9-NEXT: s_subb_u32 s9, s9, s16
; GFX9-NEXT: s_cmp_ge_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s10, -1, 0
; GFX9-NEXT: s_cmp_ge_u32 s8, s6
@@ -8861,14 +8828,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s6, s6, s10
; GFX9-NEXT: s_cmp_lg_u32 s6, 0
-; GFX9-NEXT: s_cselect_b32 s7, s13, s17
-; GFX9-NEXT: s_cselect_b32 s6, s12, s16
+; GFX9-NEXT: s_cselect_b32 s7, s18, s15
+; GFX9-NEXT: s_cselect_b32 s6, s17, s14
; GFX9-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], s[2:3]
; GFX9-NEXT: s_sub_u32 s2, s4, s2
; GFX9-NEXT: s_subb_u32 s3, s5, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s14
-; GFX9-NEXT: v_mov_b32_e32 v2, s15
+; GFX9-NEXT: v_mov_b32_e32 v1, s12
+; GFX9-NEXT: v_mov_b32_e32 v2, s13
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -9089,10 +9056,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_addc_u32 s13, 0, s14
; GFX6-NEXT: s_add_u32 s14, s0, s1
; GFX6-NEXT: v_mov_b32_e32 v0, s14
-; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0
+; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s12, s12, s13
; GFX6-NEXT: s_mul_i32 s0, s10, s12
; GFX6-NEXT: v_readfirstlane_b32 s1, v0
@@ -9123,7 +9089,6 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_add_u32 s13, s14, s0
; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s12, s12, s10
; GFX6-NEXT: s_ashr_i32 s10, s7, 31
; GFX6-NEXT: s_add_u32 s0, s6, s10
@@ -9158,46 +9123,43 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: v_readfirstlane_b32 s5, v0
; GFX6-NEXT: s_add_i32 s4, s5, s4
; GFX6-NEXT: s_mul_i32 s5, s9, s12
-; GFX6-NEXT: s_add_i32 s13, s4, s5
-; GFX6-NEXT: s_sub_i32 s14, s7, s13
+; GFX6-NEXT: s_add_i32 s14, s4, s5
+; GFX6-NEXT: s_sub_i32 s13, s7, s14
; GFX6-NEXT: s_mul_i32 s4, s8, s12
; GFX6-NEXT: s_sub_u32 s6, s6, s4
; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX6-NEXT: s_or_b32 s12, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s14, s14, s9
-; GFX6-NEXT: s_sub_u32 s15, s6, s8
-; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX6-NEXT: s_subb_u32 s15, s13, s9
+; GFX6-NEXT: s_sub_u32 s16, s6, s8
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s17, s12, s13
+; GFX6-NEXT: s_subb_u32 s17, s15, 0
+; GFX6-NEXT: s_cmp_ge_u32 s17, s9
+; GFX6-NEXT: s_cselect_b32 s18, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s16, s8
+; GFX6-NEXT: s_cselect_b32 s19, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s17, s9
+; GFX6-NEXT: s_cselect_b32 s18, s19, s18
+; GFX6-NEXT: s_or_b32 s12, s12, s13
+; GFX6-NEXT: s_subb_u32 s15, s15, s9
+; GFX6-NEXT: s_sub_u32 s19, s16, s8
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s12, s12, s13
+; GFX6-NEXT: s_subb_u32 s12, s15, 0
+; GFX6-NEXT: s_cmp_lg_u32 s18, 0
+; GFX6-NEXT: s_cselect_b32 s13, s19, s16
+; GFX6-NEXT: s_cselect_b32 s12, s12, s17
; GFX6-NEXT: s_or_b32 s4, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_subb_u32 s16, s14, 0
-; GFX6-NEXT: s_cmp_ge_u32 s16, s9
+; GFX6-NEXT: s_subb_u32 s4, s7, s14
+; GFX6-NEXT: s_cmp_ge_u32 s4, s9
; GFX6-NEXT: s_cselect_b32 s5, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s15, s8
-; GFX6-NEXT: s_cselect_b32 s17, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s16, s9
-; GFX6-NEXT: s_cselect_b32 s17, s17, s5
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_subb_u32 s14, s14, s9
-; GFX6-NEXT: s_sub_u32 s18, s15, s8
-; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX6-NEXT: s_or_b32 s4, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_subb_u32 s4, s14, 0
-; GFX6-NEXT: s_cmp_lg_u32 s17, 0
-; GFX6-NEXT: s_cselect_b32 s14, s18, s15
-; GFX6-NEXT: s_cselect_b32 s4, s4, s16
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s5, s7, s13
-; GFX6-NEXT: s_cmp_ge_u32 s5, s9
-; GFX6-NEXT: s_cselect_b32 s7, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s6, s8
-; GFX6-NEXT: s_cselect_b32 s8, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s5, s9
-; GFX6-NEXT: s_cselect_b32 s7, s8, s7
-; GFX6-NEXT: s_cmp_lg_u32 s7, 0
-; GFX6-NEXT: s_cselect_b32 s5, s4, s5
-; GFX6-NEXT: s_cselect_b32 s4, s14, s6
+; GFX6-NEXT: s_cselect_b32 s7, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s4, s9
+; GFX6-NEXT: s_cselect_b32 s5, s7, s5
+; GFX6-NEXT: s_cmp_lg_u32 s5, 0
+; GFX6-NEXT: s_cselect_b32 s5, s12, s4
+; GFX6-NEXT: s_cselect_b32 s4, s13, s6
; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11]
; GFX6-NEXT: s_sub_u32 s4, s4, s10
; GFX6-NEXT: s_subb_u32 s5, s5, s10
@@ -9219,8 +9181,8 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s8, 0, s6
-; GFX9-NEXT: s_subb_u32 s9, 0, s7
+; GFX9-NEXT: s_sub_u32 s4, 0, s6
+; GFX9-NEXT: s_subb_u32 s5, 0, s7
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -9230,56 +9192,52 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s10, v2
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_mul_i32 s5, s8, s10
-; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4
-; GFX9-NEXT: s_mul_i32 s11, s9, s4
-; GFX9-NEXT: s_add_i32 s5, s12, s5
-; GFX9-NEXT: s_mul_i32 s13, s8, s4
-; GFX9-NEXT: s_add_i32 s5, s5, s11
-; GFX9-NEXT: s_mul_hi_u32 s12, s4, s13
-; GFX9-NEXT: s_mul_i32 s14, s4, s5
-; GFX9-NEXT: s_mul_hi_u32 s11, s4, s5
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: v_readfirstlane_b32 s9, v1
+; GFX9-NEXT: s_mul_i32 s10, s4, s8
+; GFX9-NEXT: s_mul_hi_u32 s12, s4, s9
+; GFX9-NEXT: s_mul_i32 s11, s5, s9
+; GFX9-NEXT: s_add_i32 s10, s12, s10
+; GFX9-NEXT: s_mul_i32 s13, s4, s9
+; GFX9-NEXT: s_add_i32 s10, s10, s11
+; GFX9-NEXT: s_mul_hi_u32 s12, s9, s13
+; GFX9-NEXT: s_mul_i32 s14, s9, s10
+; GFX9-NEXT: s_mul_hi_u32 s11, s9, s10
; GFX9-NEXT: s_add_u32 s12, s12, s14
; GFX9-NEXT: s_addc_u32 s11, 0, s11
-; GFX9-NEXT: s_mul_hi_u32 s15, s10, s13
-; GFX9-NEXT: s_mul_i32 s13, s10, s13
+; GFX9-NEXT: s_mul_hi_u32 s15, s8, s13
+; GFX9-NEXT: s_mul_i32 s13, s8, s13
; GFX9-NEXT: s_add_u32 s12, s12, s13
-; GFX9-NEXT: s_mul_hi_u32 s14, s10, s5
+; GFX9-NEXT: s_mul_hi_u32 s14, s8, s10
; GFX9-NEXT: s_addc_u32 s11, s11, s15
; GFX9-NEXT: s_addc_u32 s12, s14, 0
-; GFX9-NEXT: s_mul_i32 s5, s10, s5
-; GFX9-NEXT: s_add_u32 s5, s11, s5
+; GFX9-NEXT: s_mul_i32 s10, s8, s10
+; GFX9-NEXT: s_add_u32 s10, s11, s10
; GFX9-NEXT: s_addc_u32 s11, 0, s12
-; GFX9-NEXT: s_add_u32 s12, s4, s5
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s10, s10, s11
-; GFX9-NEXT: s_mul_i32 s4, s8, s10
-; GFX9-NEXT: s_mul_hi_u32 s5, s8, s12
-; GFX9-NEXT: s_add_i32 s4, s5, s4
-; GFX9-NEXT: s_mul_i32 s9, s9, s12
-; GFX9-NEXT: s_add_i32 s4, s4, s9
-; GFX9-NEXT: s_mul_i32 s8, s8, s12
-; GFX9-NEXT: s_mul_hi_u32 s9, s10, s8
-; GFX9-NEXT: s_mul_i32 s11, s10, s8
-; GFX9-NEXT: s_mul_i32 s14, s12, s4
-; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8
-; GFX9-NEXT: s_mul_hi_u32 s13, s12, s4
-; GFX9-NEXT: s_add_u32 s8, s8, s14
+; GFX9-NEXT: s_add_u32 s9, s9, s10
+; GFX9-NEXT: s_addc_u32 s8, s8, s11
+; GFX9-NEXT: s_mul_i32 s10, s4, s8
+; GFX9-NEXT: s_mul_hi_u32 s11, s4, s9
+; GFX9-NEXT: s_add_i32 s10, s11, s10
+; GFX9-NEXT: s_mul_i32 s5, s5, s9
+; GFX9-NEXT: s_add_i32 s10, s10, s5
+; GFX9-NEXT: s_mul_i32 s4, s4, s9
+; GFX9-NEXT: s_mul_hi_u32 s11, s8, s4
+; GFX9-NEXT: s_mul_i32 s12, s8, s4
+; GFX9-NEXT: s_mul_i32 s14, s9, s10
+; GFX9-NEXT: s_mul_hi_u32 s4, s9, s4
+; GFX9-NEXT: s_mul_hi_u32 s13, s9, s10
+; GFX9-NEXT: s_add_u32 s4, s4, s14
; GFX9-NEXT: s_addc_u32 s13, 0, s13
-; GFX9-NEXT: s_add_u32 s8, s8, s11
-; GFX9-NEXT: s_mul_hi_u32 s5, s10, s4
-; GFX9-NEXT: s_addc_u32 s8, s13, s9
+; GFX9-NEXT: s_add_u32 s4, s4, s12
+; GFX9-NEXT: s_mul_hi_u32 s5, s8, s10
+; GFX9-NEXT: s_addc_u32 s4, s13, s11
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s4, s10, s4
-; GFX9-NEXT: s_add_u32 s4, s8, s4
-; GFX9-NEXT: s_addc_u32 s8, 0, s5
-; GFX9-NEXT: s_add_u32 s9, s12, s4
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s8, s10, s8
+; GFX9-NEXT: s_mul_i32 s10, s8, s10
+; GFX9-NEXT: s_add_u32 s4, s4, s10
+; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_add_u32 s9, s9, s4
+; GFX9-NEXT: s_addc_u32 s8, s8, s5
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: s_add_u32 s2, s2, s4
@@ -9309,11 +9267,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_mul_i32 s8, s6, s8
; GFX9-NEXT: s_sub_u32 s2, s2, s8
; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_subb_u32 s13, s10, s7
; GFX9-NEXT: s_sub_u32 s14, s2, s6
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
; GFX9-NEXT: s_subb_u32 s15, s13, 0
; GFX9-NEXT: s_cmp_ge_u32 s15, s7
; GFX9-NEXT: s_cselect_b32 s16, -1, 0
@@ -9322,13 +9278,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_cmp_eq_u32 s15, s7
; GFX9-NEXT: s_cselect_b32 s16, s17, s16
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s13, s13, s7
-; GFX9-NEXT: s_sub_u32 s17, s14, s6
-; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s10, s13, 0
+; GFX9-NEXT: s_subb_u32 s10, s13, s7
+; GFX9-NEXT: s_sub_u32 s11, s14, s6
+; GFX9-NEXT: s_subb_u32 s10, s10, 0
; GFX9-NEXT: s_cmp_lg_u32 s16, 0
-; GFX9-NEXT: s_cselect_b32 s11, s17, s14
+; GFX9-NEXT: s_cselect_b32 s11, s11, s14
; GFX9-NEXT: s_cselect_b32 s10, s10, s15
; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_subb_u32 s3, s3, s12
@@ -9490,10 +9444,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_addc_u32 s15, 0, s16
; GFX6-NEXT: s_add_u32 s16, s6, s7
; GFX6-NEXT: v_mov_b32_e32 v0, s16
-; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX6-NEXT: s_or_b32 s6, s6, s7
-; GFX6-NEXT: s_cmp_lg_u32 s6, 0
; GFX6-NEXT: s_addc_u32 s14, s14, s15
; GFX6-NEXT: s_mul_i32 s6, s12, s14
; GFX6-NEXT: v_readfirstlane_b32 s7, v0
@@ -9524,7 +9477,6 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_add_u32 s13, s16, s6
; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX6-NEXT: s_or_b32 s6, s6, s7
-; GFX6-NEXT: s_cmp_lg_u32 s6, 0
; GFX6-NEXT: s_addc_u32 s12, s14, s12
; GFX6-NEXT: s_ashr_i32 s6, s9, 31
; GFX6-NEXT: s_add_u32 s8, s8, s6
@@ -9557,49 +9509,46 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_readfirstlane_b32 s14, v0
; GFX6-NEXT: s_add_i32 s13, s14, s13
; GFX6-NEXT: s_mul_i32 s14, s3, s12
-; GFX6-NEXT: s_add_i32 s14, s13, s14
-; GFX6-NEXT: s_sub_i32 s15, s9, s14
+; GFX6-NEXT: s_add_i32 s16, s13, s14
+; GFX6-NEXT: s_sub_i32 s14, s9, s16
; GFX6-NEXT: s_mul_i32 s12, s2, s12
; GFX6-NEXT: s_sub_u32 s8, s8, s12
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s16, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_subb_u32 s15, s15, s3
-; GFX6-NEXT: s_sub_u32 s17, s8, s2
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s18, s15, 0
-; GFX6-NEXT: s_cmp_ge_u32 s18, s3
-; GFX6-NEXT: s_cselect_b32 s13, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s17, s2
-; GFX6-NEXT: s_cselect_b32 s19, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s18, s3
-; GFX6-NEXT: s_cselect_b32 s19, s19, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s15, s15, s3
-; GFX6-NEXT: s_sub_u32 s20, s17, s2
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s15, s12, s13
+; GFX6-NEXT: s_subb_u32 s17, s14, s3
+; GFX6-NEXT: s_sub_u32 s18, s8, s2
+; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT: s_or_b32 s19, s14, s15
+; GFX6-NEXT: s_subb_u32 s19, s17, 0
+; GFX6-NEXT: s_cmp_ge_u32 s19, s3
+; GFX6-NEXT: s_cselect_b32 s20, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s18, s2
+; GFX6-NEXT: s_cselect_b32 s21, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s19, s3
+; GFX6-NEXT: s_cselect_b32 s20, s21, s20
+; GFX6-NEXT: s_or_b32 s14, s14, s15
+; GFX6-NEXT: s_subb_u32 s17, s17, s3
+; GFX6-NEXT: s_sub_u32 s21, s18, s2
+; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT: s_or_b32 s14, s14, s15
+; GFX6-NEXT: s_subb_u32 s14, s17, 0
+; GFX6-NEXT: s_cmp_lg_u32 s20, 0
+; GFX6-NEXT: s_cselect_b32 s15, s21, s18
+; GFX6-NEXT: s_cselect_b32 s14, s14, s19
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s12, s15, 0
-; GFX6-NEXT: s_cmp_lg_u32 s19, 0
-; GFX6-NEXT: s_cselect_b32 s13, s20, s17
-; GFX6-NEXT: s_cselect_b32 s12, s12, s18
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_subb_u32 s9, s9, s14
+; GFX6-NEXT: s_subb_u32 s9, s9, s16
; GFX6-NEXT: s_cmp_ge_u32 s9, s3
-; GFX6-NEXT: s_cselect_b32 s14, -1, 0
+; GFX6-NEXT: s_cselect_b32 s12, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s8, s2
; GFX6-NEXT: s_cselect_b32 s2, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s9, s3
-; GFX6-NEXT: s_cselect_b32 s2, s2, s14
+; GFX6-NEXT: s_cselect_b32 s2, s2, s12
; GFX6-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6-NEXT: s_cselect_b32 s3, s12, s9
-; GFX6-NEXT: s_cselect_b32 s2, s13, s8
+; GFX6-NEXT: s_cselect_b32 s3, s14, s9
+; GFX6-NEXT: s_cselect_b32 s2, s15, s8
; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7]
-; GFX6-NEXT: s_sub_u32 s12, s2, s6
-; GFX6-NEXT: s_subb_u32 s13, s3, s6
+; GFX6-NEXT: s_sub_u32 s14, s2, s6
+; GFX6-NEXT: s_subb_u32 s15, s3, s6
; GFX6-NEXT: s_ashr_i32 s2, s1, 31
; GFX6-NEXT: s_add_u32 s0, s0, s2
; GFX6-NEXT: s_mov_b32 s3, s2
@@ -9618,40 +9567,39 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0
-; GFX6-NEXT: v_readfirstlane_b32 s14, v1
+; GFX6-NEXT: v_readfirstlane_b32 s12, v1
; GFX6-NEXT: v_readfirstlane_b32 s2, v0
-; GFX6-NEXT: s_mul_i32 s1, s8, s14
+; GFX6-NEXT: s_mul_i32 s1, s8, s12
; GFX6-NEXT: v_readfirstlane_b32 s3, v2
; GFX6-NEXT: s_mul_i32 s0, s9, s2
; GFX6-NEXT: s_add_i32 s1, s3, s1
; GFX6-NEXT: s_add_i32 s3, s1, s0
-; GFX6-NEXT: s_mul_i32 s15, s8, s2
+; GFX6-NEXT: s_mul_i32 s13, s8, s2
; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3
-; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: s_mul_i32 s4, s2, s3
; GFX6-NEXT: v_readfirstlane_b32 s5, v2
; GFX6-NEXT: v_readfirstlane_b32 s16, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15
+; GFX6-NEXT: v_mul_hi_u32 v0, v1, s13
; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3
; GFX6-NEXT: s_add_u32 s4, s16, s4
; GFX6-NEXT: s_addc_u32 s5, 0, s5
-; GFX6-NEXT: s_mul_i32 s15, s14, s15
+; GFX6-NEXT: s_mul_i32 s13, s12, s13
; GFX6-NEXT: v_readfirstlane_b32 s16, v0
-; GFX6-NEXT: s_add_u32 s4, s4, s15
+; GFX6-NEXT: s_add_u32 s4, s4, s13
; GFX6-NEXT: s_addc_u32 s4, s5, s16
; GFX6-NEXT: v_readfirstlane_b32 s5, v1
; GFX6-NEXT: s_addc_u32 s5, s5, 0
-; GFX6-NEXT: s_mul_i32 s3, s14, s3
+; GFX6-NEXT: s_mul_i32 s3, s12, s3
; GFX6-NEXT: s_add_u32 s3, s4, s3
; GFX6-NEXT: s_addc_u32 s4, 0, s5
; GFX6-NEXT: s_add_u32 s5, s2, s3
; GFX6-NEXT: v_mov_b32_e32 v0, s5
-; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
+; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6-NEXT: s_addc_u32 s4, s14, s4
+; GFX6-NEXT: s_addc_u32 s4, s12, s4
; GFX6-NEXT: s_mul_i32 s2, s8, s4
; GFX6-NEXT: v_readfirstlane_b32 s3, v0
; GFX6-NEXT: s_add_i32 s2, s3, s2
@@ -9665,102 +9613,98 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX6-NEXT: s_mul_i32 s9, s5, s2
-; GFX6-NEXT: v_readfirstlane_b32 s15, v2
-; GFX6-NEXT: s_add_u32 s9, s15, s9
-; GFX6-NEXT: v_readfirstlane_b32 s14, v0
+; GFX6-NEXT: v_readfirstlane_b32 s13, v2
+; GFX6-NEXT: s_add_u32 s9, s13, s9
+; GFX6-NEXT: v_readfirstlane_b32 s12, v0
; GFX6-NEXT: s_mul_i32 s3, s4, s3
-; GFX6-NEXT: s_addc_u32 s14, 0, s14
+; GFX6-NEXT: s_addc_u32 s12, 0, s12
; GFX6-NEXT: v_readfirstlane_b32 s8, v3
; GFX6-NEXT: s_add_u32 s3, s9, s3
-; GFX6-NEXT: s_addc_u32 s3, s14, s8
+; GFX6-NEXT: s_addc_u32 s3, s12, s8
; GFX6-NEXT: v_readfirstlane_b32 s8, v1
; GFX6-NEXT: s_addc_u32 s8, s8, 0
; GFX6-NEXT: s_mul_i32 s2, s4, s2
; GFX6-NEXT: s_add_u32 s2, s3, s2
; GFX6-NEXT: s_addc_u32 s8, 0, s8
-; GFX6-NEXT: s_add_u32 s14, s5, s2
+; GFX6-NEXT: s_add_u32 s12, s5, s2
; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6-NEXT: s_addc_u32 s15, s4, s8
+; GFX6-NEXT: s_addc_u32 s13, s4, s8
; GFX6-NEXT: s_ashr_i32 s4, s11, 31
; GFX6-NEXT: s_add_u32 s2, s10, s4
; GFX6-NEXT: s_mov_b32 s5, s4
; GFX6-NEXT: s_addc_u32 s3, s11, s4
; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v0, s15
+; GFX6-NEXT: v_mov_b32_e32 v0, s13
; GFX6-NEXT: v_mul_hi_u32 v1, s8, v0
-; GFX6-NEXT: v_mov_b32_e32 v2, s14
+; GFX6-NEXT: v_mov_b32_e32 v2, s12
; GFX6-NEXT: v_mul_hi_u32 v3, s8, v2
-; GFX6-NEXT: s_mul_i32 s2, s8, s15
+; GFX6-NEXT: s_mul_i32 s2, s8, s13
; GFX6-NEXT: v_readfirstlane_b32 s10, v1
; GFX6-NEXT: v_mul_hi_u32 v1, s9, v2
; GFX6-NEXT: v_readfirstlane_b32 s11, v3
; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0
; GFX6-NEXT: s_add_u32 s2, s11, s2
; GFX6-NEXT: s_addc_u32 s10, 0, s10
-; GFX6-NEXT: s_mul_i32 s11, s9, s14
-; GFX6-NEXT: v_readfirstlane_b32 s14, v1
+; GFX6-NEXT: s_mul_i32 s11, s9, s12
+; GFX6-NEXT: v_readfirstlane_b32 s12, v1
; GFX6-NEXT: s_add_u32 s2, s2, s11
-; GFX6-NEXT: s_addc_u32 s2, s10, s14
+; GFX6-NEXT: s_addc_u32 s2, s10, s12
; GFX6-NEXT: v_readfirstlane_b32 s10, v0
; GFX6-NEXT: s_addc_u32 s10, s10, 0
-; GFX6-NEXT: s_mul_i32 s11, s9, s15
+; GFX6-NEXT: s_mul_i32 s11, s9, s13
; GFX6-NEXT: s_add_u32 s11, s2, s11
; GFX6-NEXT: v_mov_b32_e32 v0, s11
; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
; GFX6-NEXT: s_addc_u32 s10, 0, s10
; GFX6-NEXT: s_mul_i32 s10, s6, s10
; GFX6-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NEXT: v_readfirstlane_b32 s14, v0
-; GFX6-NEXT: s_add_i32 s10, s14, s10
-; GFX6-NEXT: s_mul_i32 s14, s7, s11
-; GFX6-NEXT: s_add_i32 s14, s10, s14
-; GFX6-NEXT: s_sub_i32 s15, s9, s14
+; GFX6-NEXT: v_readfirstlane_b32 s12, v0
+; GFX6-NEXT: s_add_i32 s10, s12, s10
+; GFX6-NEXT: s_mul_i32 s12, s7, s11
+; GFX6-NEXT: s_add_i32 s16, s10, s12
+; GFX6-NEXT: s_sub_i32 s12, s9, s16
; GFX6-NEXT: s_mul_i32 s10, s6, s11
; GFX6-NEXT: s_sub_u32 s8, s8, s10
; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX6-NEXT: s_or_b32 s16, s10, s11
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_subb_u32 s15, s15, s7
-; GFX6-NEXT: s_sub_u32 s17, s8, s6
-; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX6-NEXT: s_or_b32 s10, s10, s11
-; GFX6-NEXT: s_cmp_lg_u32 s10, 0
-; GFX6-NEXT: s_subb_u32 s18, s15, 0
-; GFX6-NEXT: s_cmp_ge_u32 s18, s7
-; GFX6-NEXT: s_cselect_b32 s11, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s17, s6
-; GFX6-NEXT: s_cselect_b32 s19, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s18, s7
-; GFX6-NEXT: s_cselect_b32 s19, s19, s11
-; GFX6-NEXT: s_cmp_lg_u32 s10, 0
-; GFX6-NEXT: s_subb_u32 s15, s15, s7
-; GFX6-NEXT: s_sub_u32 s20, s17, s6
-; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GFX6-NEXT: s_or_b32 s13, s10, s11
+; GFX6-NEXT: s_subb_u32 s17, s12, s7
+; GFX6-NEXT: s_sub_u32 s18, s8, s6
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s19, s12, s13
+; GFX6-NEXT: s_subb_u32 s19, s17, 0
+; GFX6-NEXT: s_cmp_ge_u32 s19, s7
+; GFX6-NEXT: s_cselect_b32 s20, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s18, s6
+; GFX6-NEXT: s_cselect_b32 s21, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s19, s7
+; GFX6-NEXT: s_cselect_b32 s20, s21, s20
+; GFX6-NEXT: s_or_b32 s12, s12, s13
+; GFX6-NEXT: s_subb_u32 s17, s17, s7
+; GFX6-NEXT: s_sub_u32 s21, s18, s6
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s12, s12, s13
+; GFX6-NEXT: s_subb_u32 s12, s17, 0
+; GFX6-NEXT: s_cmp_lg_u32 s20, 0
+; GFX6-NEXT: s_cselect_b32 s13, s21, s18
+; GFX6-NEXT: s_cselect_b32 s12, s12, s19
; GFX6-NEXT: s_or_b32 s10, s10, s11
-; GFX6-NEXT: s_cmp_lg_u32 s10, 0
-; GFX6-NEXT: s_subb_u32 s10, s15, 0
-; GFX6-NEXT: s_cmp_lg_u32 s19, 0
-; GFX6-NEXT: s_cselect_b32 s11, s20, s17
-; GFX6-NEXT: s_cselect_b32 s10, s10, s18
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_subb_u32 s9, s9, s14
+; GFX6-NEXT: s_subb_u32 s9, s9, s16
; GFX6-NEXT: s_cmp_ge_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s14, -1, 0
+; GFX6-NEXT: s_cselect_b32 s10, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s8, s6
; GFX6-NEXT: s_cselect_b32 s6, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s6, s6, s14
+; GFX6-NEXT: s_cselect_b32 s6, s6, s10
; GFX6-NEXT: s_cmp_lg_u32 s6, 0
-; GFX6-NEXT: s_cselect_b32 s7, s10, s9
-; GFX6-NEXT: s_cselect_b32 s6, s11, s8
+; GFX6-NEXT: s_cselect_b32 s7, s12, s9
+; GFX6-NEXT: s_cselect_b32 s6, s13, s8
; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
; GFX6-NEXT: s_sub_u32 s5, s6, s4
; GFX6-NEXT: s_subb_u32 s4, s7, s4
; GFX6-NEXT: s_mov_b32 s2, -1
-; GFX6-NEXT: v_mov_b32_e32 v0, s12
-; GFX6-NEXT: v_mov_b32_e32 v1, s13
+; GFX6-NEXT: v_mov_b32_e32 v0, s14
+; GFX6-NEXT: v_mov_b32_e32 v1, s15
; GFX6-NEXT: v_mov_b32_e32 v2, s5
; GFX6-NEXT: v_mov_b32_e32 v3, s4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -9780,8 +9724,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7]
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
-; GFX9-NEXT: s_sub_u32 s12, 0, s2
-; GFX9-NEXT: s_subb_u32 s13, 0, s3
+; GFX9-NEXT: s_sub_u32 s6, 0, s2
+; GFX9-NEXT: s_subb_u32 s7, 0, s3
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -9790,56 +9734,52 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s14, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v0
-; GFX9-NEXT: s_mul_i32 s7, s12, s14
-; GFX9-NEXT: s_mul_hi_u32 s16, s12, s6
-; GFX9-NEXT: s_mul_i32 s15, s13, s6
-; GFX9-NEXT: s_add_i32 s7, s16, s7
-; GFX9-NEXT: s_mul_i32 s17, s12, s6
-; GFX9-NEXT: s_add_i32 s7, s7, s15
-; GFX9-NEXT: s_mul_hi_u32 s16, s6, s17
-; GFX9-NEXT: s_mul_i32 s18, s6, s7
-; GFX9-NEXT: s_mul_hi_u32 s15, s6, s7
+; GFX9-NEXT: v_readfirstlane_b32 s12, v1
+; GFX9-NEXT: v_readfirstlane_b32 s13, v0
+; GFX9-NEXT: s_mul_i32 s14, s6, s12
+; GFX9-NEXT: s_mul_hi_u32 s16, s6, s13
+; GFX9-NEXT: s_mul_i32 s15, s7, s13
+; GFX9-NEXT: s_add_i32 s14, s16, s14
+; GFX9-NEXT: s_mul_i32 s17, s6, s13
+; GFX9-NEXT: s_add_i32 s14, s14, s15
+; GFX9-NEXT: s_mul_hi_u32 s16, s13, s17
+; GFX9-NEXT: s_mul_i32 s18, s13, s14
+; GFX9-NEXT: s_mul_hi_u32 s15, s13, s14
; GFX9-NEXT: s_add_u32 s16, s16, s18
; GFX9-NEXT: s_addc_u32 s15, 0, s15
-; GFX9-NEXT: s_mul_hi_u32 s18, s14, s17
-; GFX9-NEXT: s_mul_i32 s17, s14, s17
+; GFX9-NEXT: s_mul_hi_u32 s18, s12, s17
+; GFX9-NEXT: s_mul_i32 s17, s12, s17
; GFX9-NEXT: s_add_u32 s16, s16, s17
-; GFX9-NEXT: s_mul_hi_u32 s19, s14, s7
+; GFX9-NEXT: s_mul_hi_u32 s19, s12, s14
; GFX9-NEXT: s_addc_u32 s15, s15, s18
; GFX9-NEXT: s_addc_u32 s16, s19, 0
-; GFX9-NEXT: s_mul_i32 s7, s14, s7
-; GFX9-NEXT: s_add_u32 s7, s15, s7
+; GFX9-NEXT: s_mul_i32 s14, s12, s14
+; GFX9-NEXT: s_add_u32 s14, s15, s14
; GFX9-NEXT: s_addc_u32 s15, 0, s16
-; GFX9-NEXT: s_add_u32 s16, s6, s7
-; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX9-NEXT: s_addc_u32 s14, s14, s15
-; GFX9-NEXT: s_mul_i32 s6, s12, s14
-; GFX9-NEXT: s_mul_hi_u32 s7, s12, s16
-; GFX9-NEXT: s_add_i32 s6, s7, s6
-; GFX9-NEXT: s_mul_i32 s13, s13, s16
-; GFX9-NEXT: s_add_i32 s6, s6, s13
-; GFX9-NEXT: s_mul_i32 s12, s12, s16
-; GFX9-NEXT: s_mul_hi_u32 s13, s14, s12
-; GFX9-NEXT: s_mul_i32 s15, s14, s12
-; GFX9-NEXT: s_mul_i32 s18, s16, s6
-; GFX9-NEXT: s_mul_hi_u32 s12, s16, s12
-; GFX9-NEXT: s_mul_hi_u32 s17, s16, s6
-; GFX9-NEXT: s_add_u32 s12, s12, s18
+; GFX9-NEXT: s_add_u32 s13, s13, s14
+; GFX9-NEXT: s_addc_u32 s12, s12, s15
+; GFX9-NEXT: s_mul_i32 s14, s6, s12
+; GFX9-NEXT: s_mul_hi_u32 s15, s6, s13
+; GFX9-NEXT: s_add_i32 s14, s15, s14
+; GFX9-NEXT: s_mul_i32 s7, s7, s13
+; GFX9-NEXT: s_add_i32 s14, s14, s7
+; GFX9-NEXT: s_mul_i32 s6, s6, s13
+; GFX9-NEXT: s_mul_hi_u32 s15, s12, s6
+; GFX9-NEXT: s_mul_i32 s16, s12, s6
+; GFX9-NEXT: s_mul_i32 s18, s13, s14
+; GFX9-NEXT: s_mul_hi_u32 s6, s13, s6
+; GFX9-NEXT: s_mul_hi_u32 s17, s13, s14
+; GFX9-NEXT: s_add_u32 s6, s6, s18
; GFX9-NEXT: s_addc_u32 s17, 0, s17
-; GFX9-NEXT: s_add_u32 s12, s12, s15
-; GFX9-NEXT: s_mul_hi_u32 s7, s14, s6
-; GFX9-NEXT: s_addc_u32 s12, s17, s13
+; GFX9-NEXT: s_add_u32 s6, s6, s16
+; GFX9-NEXT: s_mul_hi_u32 s7, s12, s14
+; GFX9-NEXT: s_addc_u32 s6, s17, s15
; GFX9-NEXT: s_addc_u32 s7, s7, 0
-; GFX9-NEXT: s_mul_i32 s6, s14, s6
-; GFX9-NEXT: s_add_u32 s6, s12, s6
-; GFX9-NEXT: s_addc_u32 s12, 0, s7
-; GFX9-NEXT: s_add_u32 s13, s16, s6
-; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX9-NEXT: s_addc_u32 s12, s14, s12
+; GFX9-NEXT: s_mul_i32 s14, s12, s14
+; GFX9-NEXT: s_add_u32 s6, s6, s14
+; GFX9-NEXT: s_addc_u32 s7, 0, s7
+; GFX9-NEXT: s_add_u32 s13, s13, s6
+; GFX9-NEXT: s_addc_u32 s12, s12, s7
; GFX9-NEXT: s_ashr_i32 s6, s9, 31
; GFX9-NEXT: s_add_u32 s8, s8, s6
; GFX9-NEXT: s_mov_b32 s7, s6
@@ -9868,11 +9808,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_mul_i32 s12, s2, s12
; GFX9-NEXT: s_sub_u32 s8, s8, s12
; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
; GFX9-NEXT: s_subb_u32 s17, s14, s3
; GFX9-NEXT: s_sub_u32 s18, s8, s2
; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
; GFX9-NEXT: s_subb_u32 s19, s17, 0
; GFX9-NEXT: s_cmp_ge_u32 s19, s3
; GFX9-NEXT: s_cselect_b32 s20, -1, 0
@@ -9881,13 +9819,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s19, s3
; GFX9-NEXT: s_cselect_b32 s20, s21, s20
; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT: s_subb_u32 s17, s17, s3
-; GFX9-NEXT: s_sub_u32 s21, s18, s2
-; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT: s_subb_u32 s14, s17, 0
+; GFX9-NEXT: s_subb_u32 s14, s17, s3
+; GFX9-NEXT: s_sub_u32 s15, s18, s2
+; GFX9-NEXT: s_subb_u32 s14, s14, 0
; GFX9-NEXT: s_cmp_lg_u32 s20, 0
-; GFX9-NEXT: s_cselect_b32 s15, s21, s18
+; GFX9-NEXT: s_cselect_b32 s15, s15, s18
; GFX9-NEXT: s_cselect_b32 s14, s14, s19
; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
; GFX9-NEXT: s_subb_u32 s9, s9, s16
@@ -9911,8 +9847,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s6, 0, s2
-; GFX9-NEXT: s_subb_u32 s7, 0, s3
+; GFX9-NEXT: s_sub_u32 s4, 0, s2
+; GFX9-NEXT: s_subb_u32 s5, 0, s3
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -9922,74 +9858,70 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: v_readfirstlane_b32 s6, v1
; GFX9-NEXT: v_readfirstlane_b32 s9, v2
-; GFX9-NEXT: s_mul_hi_u32 s8, s6, s4
-; GFX9-NEXT: s_mul_i32 s14, s6, s9
-; GFX9-NEXT: s_mul_i32 s5, s7, s4
+; GFX9-NEXT: s_mul_hi_u32 s8, s4, s6
+; GFX9-NEXT: s_mul_i32 s14, s4, s9
+; GFX9-NEXT: s_mul_i32 s7, s5, s6
; GFX9-NEXT: s_add_i32 s8, s8, s14
-; GFX9-NEXT: s_add_i32 s8, s8, s5
-; GFX9-NEXT: s_mul_i32 s15, s6, s4
-; GFX9-NEXT: s_mul_i32 s14, s4, s8
-; GFX9-NEXT: s_mul_hi_u32 s16, s4, s15
-; GFX9-NEXT: s_mul_hi_u32 s5, s4, s8
+; GFX9-NEXT: s_add_i32 s8, s8, s7
+; GFX9-NEXT: s_mul_i32 s15, s4, s6
+; GFX9-NEXT: s_mul_i32 s14, s6, s8
+; GFX9-NEXT: s_mul_hi_u32 s16, s6, s15
+; GFX9-NEXT: s_mul_hi_u32 s7, s6, s8
; GFX9-NEXT: s_add_u32 s14, s16, s14
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_addc_u32 s7, 0, s7
; GFX9-NEXT: s_mul_hi_u32 s17, s9, s15
; GFX9-NEXT: s_mul_i32 s15, s9, s15
; GFX9-NEXT: s_add_u32 s14, s14, s15
; GFX9-NEXT: s_mul_hi_u32 s16, s9, s8
-; GFX9-NEXT: s_addc_u32 s5, s5, s17
+; GFX9-NEXT: s_addc_u32 s7, s7, s17
; GFX9-NEXT: s_addc_u32 s14, s16, 0
; GFX9-NEXT: s_mul_i32 s8, s9, s8
-; GFX9-NEXT: s_add_u32 s5, s5, s8
+; GFX9-NEXT: s_add_u32 s7, s7, s8
; GFX9-NEXT: s_addc_u32 s8, 0, s14
-; GFX9-NEXT: s_add_u32 s14, s4, s5
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s8, s9, s8
-; GFX9-NEXT: s_mul_i32 s4, s6, s8
-; GFX9-NEXT: s_mul_hi_u32 s5, s6, s14
-; GFX9-NEXT: s_add_i32 s4, s5, s4
-; GFX9-NEXT: s_mul_i32 s7, s7, s14
-; GFX9-NEXT: s_add_i32 s4, s4, s7
-; GFX9-NEXT: s_mul_i32 s6, s6, s14
-; GFX9-NEXT: s_mul_hi_u32 s7, s8, s6
-; GFX9-NEXT: s_mul_i32 s9, s8, s6
-; GFX9-NEXT: s_mul_i32 s16, s14, s4
-; GFX9-NEXT: s_mul_hi_u32 s6, s14, s6
-; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4
-; GFX9-NEXT: s_add_u32 s6, s6, s16
+; GFX9-NEXT: s_add_u32 s6, s6, s7
+; GFX9-NEXT: s_addc_u32 s7, s9, s8
+; GFX9-NEXT: s_mul_i32 s8, s4, s7
+; GFX9-NEXT: s_mul_hi_u32 s9, s4, s6
+; GFX9-NEXT: s_add_i32 s8, s9, s8
+; GFX9-NEXT: s_mul_i32 s5, s5, s6
+; GFX9-NEXT: s_add_i32 s8, s8, s5
+; GFX9-NEXT: s_mul_i32 s4, s4, s6
+; GFX9-NEXT: s_mul_hi_u32 s9, s7, s4
+; GFX9-NEXT: s_mul_i32 s14, s7, s4
+; GFX9-NEXT: s_mul_i32 s16, s6, s8
+; GFX9-NEXT: s_mul_hi_u32 s4, s6, s4
+; GFX9-NEXT: s_mul_hi_u32 s15, s6, s8
+; GFX9-NEXT: s_add_u32 s4, s4, s16
; GFX9-NEXT: s_addc_u32 s15, 0, s15
-; GFX9-NEXT: s_add_u32 s6, s6, s9
-; GFX9-NEXT: s_mul_hi_u32 s5, s8, s4
-; GFX9-NEXT: s_addc_u32 s6, s15, s7
+; GFX9-NEXT: s_add_u32 s4, s4, s14
+; GFX9-NEXT: s_mul_hi_u32 s5, s7, s8
+; GFX9-NEXT: s_addc_u32 s4, s15, s9
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s4, s8, s4
-; GFX9-NEXT: s_add_u32 s4, s6, s4
-; GFX9-NEXT: s_addc_u32 s6, 0, s5
-; GFX9-NEXT: s_add_u32 s9, s14, s4
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s8, s8, s6
+; GFX9-NEXT: s_mul_i32 s8, s7, s8
+; GFX9-NEXT: s_add_u32 s4, s4, s8
+; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_add_u32 s8, s6, s4
+; GFX9-NEXT: s_addc_u32 s9, s7, s5
; GFX9-NEXT: s_ashr_i32 s4, s11, 31
; GFX9-NEXT: s_add_u32 s6, s10, s4
; GFX9-NEXT: s_mov_b32 s5, s4
; GFX9-NEXT: s_addc_u32 s7, s11, s4
; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
-; GFX9-NEXT: s_mul_i32 s11, s6, s8
-; GFX9-NEXT: s_mul_hi_u32 s14, s6, s9
-; GFX9-NEXT: s_mul_hi_u32 s10, s6, s8
+; GFX9-NEXT: s_mul_i32 s11, s6, s9
+; GFX9-NEXT: s_mul_hi_u32 s14, s6, s8
+; GFX9-NEXT: s_mul_hi_u32 s10, s6, s9
; GFX9-NEXT: s_add_u32 s11, s14, s11
; GFX9-NEXT: s_addc_u32 s10, 0, s10
-; GFX9-NEXT: s_mul_hi_u32 s15, s7, s9
-; GFX9-NEXT: s_mul_i32 s9, s7, s9
-; GFX9-NEXT: s_add_u32 s9, s11, s9
-; GFX9-NEXT: s_mul_hi_u32 s14, s7, s8
-; GFX9-NEXT: s_addc_u32 s9, s10, s15
-; GFX9-NEXT: s_addc_u32 s10, s14, 0
+; GFX9-NEXT: s_mul_hi_u32 s15, s7, s8
; GFX9-NEXT: s_mul_i32 s8, s7, s8
-; GFX9-NEXT: s_add_u32 s8, s9, s8
+; GFX9-NEXT: s_add_u32 s8, s11, s8
+; GFX9-NEXT: s_mul_hi_u32 s14, s7, s9
+; GFX9-NEXT: s_addc_u32 s8, s10, s15
+; GFX9-NEXT: s_addc_u32 s10, s14, 0
+; GFX9-NEXT: s_mul_i32 s9, s7, s9
+; GFX9-NEXT: s_add_u32 s8, s8, s9
; GFX9-NEXT: s_addc_u32 s9, 0, s10
; GFX9-NEXT: s_mul_i32 s9, s2, s9
; GFX9-NEXT: s_mul_hi_u32 s10, s2, s8
@@ -10000,11 +9932,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_mul_i32 s8, s2, s8
; GFX9-NEXT: s_sub_u32 s6, s6, s8
; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_subb_u32 s15, s10, s3
; GFX9-NEXT: s_sub_u32 s16, s6, s2
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
; GFX9-NEXT: s_subb_u32 s17, s15, 0
; GFX9-NEXT: s_cmp_ge_u32 s17, s3
; GFX9-NEXT: s_cselect_b32 s18, -1, 0
@@ -10013,13 +9943,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s17, s3
; GFX9-NEXT: s_cselect_b32 s18, s19, s18
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s15, s15, s3
-; GFX9-NEXT: s_sub_u32 s19, s16, s2
-; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s10, s15, 0
+; GFX9-NEXT: s_subb_u32 s10, s15, s3
+; GFX9-NEXT: s_sub_u32 s11, s16, s2
+; GFX9-NEXT: s_subb_u32 s10, s10, 0
; GFX9-NEXT: s_cmp_lg_u32 s18, 0
-; GFX9-NEXT: s_cselect_b32 s11, s19, s16
+; GFX9-NEXT: s_cselect_b32 s11, s11, s16
; GFX9-NEXT: s_cselect_b32 s10, s10, s17
; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_subb_u32 s7, s7, s14
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index 394727c..01f4414 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -612,12 +612,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s8
+; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -653,12 +652,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s8
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -693,11 +691,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
-; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -733,11 +730,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -774,11 +770,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
-; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -818,11 +813,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -859,11 +853,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -901,15 +894,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -999,12 +992,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s8
+; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1042,12 +1034,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s8
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1084,11 +1075,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
-; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1127,11 +1117,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1171,11 +1160,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
-; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1218,11 +1206,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1261,11 +1248,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX12W64-NEXT: s_cbranch_scc1 .LBB3_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1306,15 +1292,15 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB3_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX12W32-NEXT: s_cbranch_scc1 .LBB3_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2073,12 +2059,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s8
+; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB7_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2114,12 +2099,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s8
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2154,11 +2138,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
-; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2194,11 +2177,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2235,11 +2217,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
-; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2279,11 +2260,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2321,11 +2301,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2363,15 +2342,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 258bc295..9db6d70 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -717,12 +717,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
+; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s3
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -762,12 +761,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
+; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s3
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -805,13 +803,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1
; GFX1064_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[0:1]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -853,11 +850,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032_ITERATIVE-NEXT: s_add_i32 s6, s6, s2
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -897,14 +893,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1164_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -949,11 +944,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1132_ITERATIVE-NEXT: s_add_i32 s6, s6, s2
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -993,14 +987,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1264_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s7
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7
-; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8
-; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -1028,6 +1022,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xf1ff
; GFX1264_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -1041,15 +1036,15 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1232_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1
-; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2363,7 +2358,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2416,7 +2410,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2462,13 +2455,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s2
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2
; GFX1064_ITERATIVE-NEXT: s_add_u32 s6, s6, s3
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2515,13 +2507,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s1
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1
; GFX1032_ITERATIVE-NEXT: s_add_u32 s6, s6, s2
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2569,14 +2560,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s2
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
; GFX1164_ITERATIVE-NEXT: s_add_u32 s6, s6, s3
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2626,14 +2616,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
; GFX1132_ITERATIVE-NEXT: s_add_u32 s6, s6, s2
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2677,16 +2666,16 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1264_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10
-; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9]
; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
-; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2731,17 +2720,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
-; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8
-; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -4490,12 +4479,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
+; GFX8_ITERATIVE-NEXT: s_add_i32 s12, s12, s3
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4550,12 +4538,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
+; GFX9_ITERATIVE-NEXT: s_add_i32 s12, s12, s3
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4608,13 +4595,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr2
; GFX1064_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s6
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s6
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s2
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1064_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064_ITERATIVE-NEXT: s_add_i32 s12, s12, s7
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4670,11 +4656,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s8, s1
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032_ITERATIVE-NEXT: s_add_i32 s8, s8, s2
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4728,14 +4713,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2
; GFX1164_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[0:1]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s6
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s6
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s2
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_add_i32 s12, s12, s7
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -4799,11 +4783,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s8, s1
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1132_ITERATIVE-NEXT: s_add_i32 s8, s8, s2
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -4861,14 +4844,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1264_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s7
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7
-; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8
-; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -4896,6 +4879,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xf1ff
; GFX1264_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -4909,15 +4893,15 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1232_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1
-; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -6673,7 +6657,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6746,7 +6729,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6812,13 +6794,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2
; GFX1064_ITERATIVE-NEXT: s_add_u32 s8, s8, s3
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s9, s9, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6883,13 +6864,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s1
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1
; GFX1032_ITERATIVE-NEXT: s_add_u32 s8, s8, s2
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6955,14 +6935,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2
; GFX1164_ITERATIVE-NEXT: s_add_u32 s8, s8, s3
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s9, s9, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -7036,14 +7015,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s1
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1
; GFX1132_ITERATIVE-NEXT: s_add_u32 s8, s8, s2
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -7109,16 +7087,16 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1264_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10
-; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9]
; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
-; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -7163,17 +7141,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
-; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8
-; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 23c5f4f..6167a84 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -499,12 +499,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -540,12 +539,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -580,11 +578,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -621,11 +618,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -663,11 +659,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -707,11 +702,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1088,11 +1082,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX8_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1117,11 +1110,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX9_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1147,9 +1139,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1176,9 +1167,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1206,10 +1196,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1239,10 +1227,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2022,7 +2008,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2071,7 +2056,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2112,13 +2096,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6
; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2160,13 +2143,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2209,14 +2191,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -2261,14 +2242,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -2881,7 +2861,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2914,7 +2893,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2946,7 +2924,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2979,7 +2956,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3013,8 +2989,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3048,9 +3022,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s4
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3906,12 +3879,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3947,12 +3919,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3987,11 +3958,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4028,11 +3998,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4070,11 +4039,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -4114,11 +4082,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -4495,11 +4462,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX8_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4524,11 +4490,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX9_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4554,9 +4519,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4583,9 +4547,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4613,10 +4576,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4646,10 +4607,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5452,7 +5411,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5501,7 +5459,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5542,13 +5499,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6
; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5590,13 +5546,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5639,14 +5594,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -5691,14 +5645,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -6313,12 +6266,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6354,12 +6306,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6394,11 +6345,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6435,11 +6385,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_and_b32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6477,11 +6426,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -6521,11 +6469,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_and_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -6926,12 +6873,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX8_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6973,12 +6919,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX9_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7015,15 +6960,14 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1064_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7065,12 +7009,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7109,16 +7052,15 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1164_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -7163,12 +7105,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -7672,12 +7613,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7713,12 +7653,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7753,11 +7692,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7794,11 +7732,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_or_b32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7836,11 +7773,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -7880,11 +7816,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_or_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -8284,12 +8219,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX8_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8331,12 +8265,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX9_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8373,15 +8306,14 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1064_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8423,12 +8355,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8467,16 +8398,15 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1164_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8521,12 +8451,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9030,12 +8959,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9071,12 +8999,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9111,11 +9038,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9152,11 +9078,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9194,11 +9119,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -9238,11 +9162,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -9642,12 +9565,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9689,12 +9611,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9731,15 +9652,14 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9781,12 +9701,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9825,16 +9744,15 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9879,12 +9797,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -10388,12 +10305,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10429,12 +10345,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10469,11 +10384,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10510,11 +10424,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_max_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10552,11 +10465,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -10596,11 +10508,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_max_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -11255,7 +11166,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11311,7 +11221,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11363,7 +11272,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11415,7 +11323,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11468,9 +11375,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -11525,9 +11431,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -12214,12 +12119,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12255,12 +12159,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12295,11 +12198,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12336,11 +12238,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_min_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12378,11 +12279,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -12422,11 +12322,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_min_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -13081,7 +12980,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13137,7 +13035,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13189,7 +13086,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13241,7 +13137,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13294,9 +13189,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -13351,9 +13245,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -14040,12 +13933,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14081,12 +13973,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14121,11 +14012,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14162,11 +14052,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_max_u32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14204,11 +14093,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -14248,11 +14136,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_max_u32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -14901,7 +14788,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14956,7 +14842,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15007,7 +14892,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15058,7 +14942,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15112,8 +14995,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -15169,8 +15050,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -15853,12 +15732,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15894,12 +15772,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15934,11 +15811,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15975,11 +15851,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_min_u32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16017,11 +15892,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -16061,11 +15935,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_min_u32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -16715,7 +16588,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16770,7 +16642,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16821,7 +16692,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16872,7 +16742,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16926,8 +16795,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -16983,8 +16850,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index e4def28..9afc0c6 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -611,12 +611,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s8
+; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -652,12 +651,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s8
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -692,11 +690,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
-; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -732,11 +729,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -773,11 +769,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
-; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -817,11 +812,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -858,11 +852,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -900,15 +893,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1665,12 +1658,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s8
+; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB6_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1706,12 +1698,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s8
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1746,11 +1737,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
-; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1786,11 +1776,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1827,11 +1816,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
-; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1871,11 +1859,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1913,11 +1900,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX12W64-NEXT: s_cbranch_scc1 .LBB6_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1955,15 +1941,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB6_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX12W32-NEXT: s_cbranch_scc1 .LBB6_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index 39a3c9a..10fd34f 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -628,12 +628,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s8
+; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -670,12 +669,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s8
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -711,11 +709,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
-; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -752,11 +749,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -794,11 +790,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
-; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -839,11 +834,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -880,11 +874,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -923,15 +916,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1833,12 +1826,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s8
+; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB7_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1875,12 +1867,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s8
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1916,11 +1907,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
-; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1957,11 +1947,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1999,11 +1988,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
-; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2044,11 +2032,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2086,11 +2073,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2129,15 +2115,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 7ee0015f..711d57b 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -39137,7 +39137,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v2
; GFX1250-NEXT: v_add_nc_u32_e32 v2, 32, v2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_min_u32_e64 v2, v3, -1, v2
+; GFX1250-NEXT: v_add_min_u32 v2, v3, -1, v2
; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v2, v[0:1]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0
@@ -39487,8 +39487,8 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX1250-NEXT: v_dual_ashrrev_i32 v5, 31, v5 :: v_dual_ashrrev_i32 v4, 31, v4
; GFX1250-NEXT: v_dual_add_nc_u32 v5, 32, v5 :: v_dual_add_nc_u32 v4, 32, v4
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_add_min_u32_e64 v5, v7, -1, v5
-; GFX1250-NEXT: v_add_min_u32_e64 v4, v6, -1, v4
+; GFX1250-NEXT: v_add_min_u32 v5, v7, -1, v5
+; GFX1250-NEXT: v_add_min_u32 v4, v6, -1, v4
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v5, v[0:1]
; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v4, v[2:3]
@@ -39979,9 +39979,9 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX1250TRUE16-NEXT: v_dual_add_nc_u32 v7, 32, v7 :: v_dual_add_nc_u32 v6, 32, v6
; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250TRUE16-NEXT: v_ashrrev_i32_e32 v8, 31, v8
-; GFX1250TRUE16-NEXT: v_add_min_u32_e64 v7, v10, -1, v7
+; GFX1250TRUE16-NEXT: v_add_min_u32 v7, v10, -1, v7
; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250TRUE16-NEXT: v_add_min_u32_e64 v6, v9, -1, v6
+; GFX1250TRUE16-NEXT: v_add_min_u32 v6, v9, -1, v6
; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[2:3], v7, v[2:3]
; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[4:5], v6, v[4:5]
@@ -39991,7 +39991,7 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX1250TRUE16-NEXT: v_min_u32_e32 v4, 1, v4
; GFX1250TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250TRUE16-NEXT: v_add_min_u32_e64 v8, v11, -1, v8
+; GFX1250TRUE16-NEXT: v_add_min_u32 v8, v11, -1, v8
; GFX1250TRUE16-NEXT: v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v4, v5, v4 bitop3:0x54
; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2
@@ -40027,8 +40027,8 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX1250FAKE16-NEXT: v_dual_ashrrev_i32 v6, 31, v6 :: v_dual_ashrrev_i32 v7, 31, v7
; GFX1250FAKE16-NEXT: v_dual_add_nc_u32 v6, 32, v6 :: v_dual_add_nc_u32 v7, 32, v7
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250FAKE16-NEXT: v_add_min_u32_e64 v6, v10, -1, v6
-; GFX1250FAKE16-NEXT: v_add_min_u32_e64 v7, v11, -1, v7
+; GFX1250FAKE16-NEXT: v_add_min_u32 v6, v10, -1, v6
+; GFX1250FAKE16-NEXT: v_add_min_u32 v7, v11, -1, v7
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[2:3], v6, v[2:3]
; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[0:1], v7, v[0:1]
@@ -40038,7 +40038,7 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX1250FAKE16-NEXT: v_min_u32_e32 v0, 1, v0
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250FAKE16-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX1250FAKE16-NEXT: v_add_min_u32_e64 v8, v9, -1, v8
+; GFX1250FAKE16-NEXT: v_add_min_u32 v8, v9, -1, v8
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2
@@ -40656,18 +40656,18 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX1250-NEXT: v_dual_add_nc_u32 v9, 32, v9 :: v_dual_add_nc_u32 v8, 32, v8
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_dual_ashrrev_i32 v10, 31, v10 :: v_dual_bitop2_b32 v11, v0, v1 bitop3:0x14
-; GFX1250-NEXT: v_add_min_u32_e64 v9, v13, -1, v9
+; GFX1250-NEXT: v_add_min_u32 v9, v13, -1, v9
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_add_min_u32_e64 v8, v12, -1, v8
+; GFX1250-NEXT: v_add_min_u32 v8, v12, -1, v8
; GFX1250-NEXT: v_dual_ashrrev_i32 v11, 31, v11 :: v_dual_add_nc_u32 v10, 32, v10
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_lshlrev_b64_e32 v[4:5], v9, v[4:5]
; GFX1250-NEXT: v_lshlrev_b64_e32 v[6:7], v8, v[6:7]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_add_nc_u32_e32 v11, 32, v11
-; GFX1250-NEXT: v_add_min_u32_e64 v10, v14, -1, v10
+; GFX1250-NEXT: v_add_min_u32 v10, v14, -1, v10
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_add_min_u32_e64 v11, v15, -1, v11
+; GFX1250-NEXT: v_add_min_u32 v11, v15, -1, v11
; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v10, v[2:3]
; GFX1250-NEXT: v_min_u32_e32 v6, 1, v6
; GFX1250-NEXT: v_min_u32_e32 v4, 1, v4
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index 4a6fa4f..b96de17 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -704,7 +704,6 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI-NEXT: s_add_u32 s4, s4, s6
; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0
; CISI-NEXT: s_or_b32 s6, s12, s13
-; CISI-NEXT: s_cmp_lg_u32 s6, 0
; CISI-NEXT: s_addc_u32 s5, s5, s7
; CISI-NEXT: s_mov_b32 s8, s0
; CISI-NEXT: s_mov_b32 s9, s1
@@ -725,16 +724,14 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: s_add_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_add_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_addc_u32 s1, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
-; VI-NEXT: s_addc_u32 s0, s5, s7
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: v_mov_b32_e32 v5, s0
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -746,12 +743,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s2, s12, s14
-; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_addc_u32 s0, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_add_u32 s0, s12, s14
+; GFX9-NEXT: s_addc_u32 s1, s13, s15
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -764,10 +759,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: s_add_u32 s0, s12, s14
-; GFX1010-NEXT: s_cselect_b32 s1, -1, 0
-; GFX1010-NEXT: v_mov_b32_e32 v0, s0
-; GFX1010-NEXT: s_cmp_lg_u32 s1, 0
; GFX1010-NEXT: s_addc_u32 s1, s13, s15
+; GFX1010-NEXT: v_mov_b32_e32 v0, s0
; GFX1010-NEXT: s_cselect_b32 s0, -1, 0
; GFX1010-NEXT: v_mov_b32_e32 v1, s1
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -781,10 +774,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W32-NEXT: s_add_u32 s4, s4, s6
-; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0
-; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0
; GFX1030W32-NEXT: s_addc_u32 s5, s5, s7
+; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4
; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0
; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
@@ -798,10 +789,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W64-NEXT: s_add_u32 s4, s4, s6
-; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX1030W64-NEXT: s_addc_u32 s5, s5, s7
+; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4
; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5
; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
@@ -814,10 +803,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s4, s4, s6
-; GFX11-NEXT: s_cselect_b32 s6, -1, 0
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: s_cmp_lg_u32 s6, 0
; GFX11-NEXT: s_addc_u32 s5, s5, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
@@ -831,10 +818,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_add_co_u32 s0, s12, s14
-; GFX1250-NEXT: s_cselect_b32 s1, -1, 0
-; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
-; GFX1250-NEXT: s_cmp_lg_u32 s1, 0
; GFX1250-NEXT: s_add_co_ci_u32 s1, s13, s15
+; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
; GFX1250-NEXT: s_cselect_b32 s0, -1, 0
; GFX1250-NEXT: v_mov_b32_e32 v1, s1
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -1691,7 +1676,6 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI-NEXT: s_sub_u32 s4, s4, s6
; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0
; CISI-NEXT: s_or_b32 s6, s12, s13
-; CISI-NEXT: s_cmp_lg_u32 s6, 0
; CISI-NEXT: s_subb_u32 s5, s5, s7
; CISI-NEXT: s_mov_b32 s8, s0
; CISI-NEXT: s_mov_b32 s9, s1
@@ -1712,16 +1696,14 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: s_sub_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_sub_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_subb_u32 s1, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
-; VI-NEXT: s_subb_u32 s0, s5, s7
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: v_mov_b32_e32 v5, s0
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -1733,12 +1715,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s2, s12, s14
-; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_subb_u32 s0, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_sub_u32 s0, s12, s14
+; GFX9-NEXT: s_subb_u32 s1, s13, s15
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -1751,10 +1731,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: s_sub_u32 s0, s12, s14
-; GFX1010-NEXT: s_cselect_b32 s1, -1, 0
-; GFX1010-NEXT: v_mov_b32_e32 v0, s0
-; GFX1010-NEXT: s_cmp_lg_u32 s1, 0
; GFX1010-NEXT: s_subb_u32 s1, s13, s15
+; GFX1010-NEXT: v_mov_b32_e32 v0, s0
; GFX1010-NEXT: s_cselect_b32 s0, -1, 0
; GFX1010-NEXT: v_mov_b32_e32 v1, s1
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -1768,10 +1746,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W32-NEXT: s_sub_u32 s4, s4, s6
-; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0
-; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0
; GFX1030W32-NEXT: s_subb_u32 s5, s5, s7
+; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4
; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0
; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
@@ -1785,10 +1761,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W64-NEXT: s_sub_u32 s4, s4, s6
-; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX1030W64-NEXT: s_subb_u32 s5, s5, s7
+; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4
; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5
; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
@@ -1801,10 +1775,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_u32 s4, s4, s6
-; GFX11-NEXT: s_cselect_b32 s6, -1, 0
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: s_cmp_lg_u32 s6, 0
; GFX11-NEXT: s_subb_u32 s5, s5, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
@@ -1818,10 +1790,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_sub_co_u32 s0, s12, s14
-; GFX1250-NEXT: s_cselect_b32 s1, -1, 0
-; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
-; GFX1250-NEXT: s_cmp_lg_u32 s1, 0
; GFX1250-NEXT: s_sub_co_ci_u32 s1, s13, s15
+; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
; GFX1250-NEXT: s_cselect_b32 s0, -1, 0
; GFX1250-NEXT: v_mov_b32_e32 v1, s1
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -2218,49 +2188,46 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; VI-NEXT: s_addc_u32 s6, s7, s9
; VI-NEXT: s_addc_u32 s8, s8, 0
; VI-NEXT: v_readfirstlane_b32 s7, v0
-; VI-NEXT: s_add_u32 s12, s6, s7
-; VI-NEXT: v_mov_b32_e32 v0, s12
+; VI-NEXT: s_add_u32 s10, s6, s7
+; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s4, v0, 0
-; VI-NEXT: s_addc_u32 s13, 0, s8
-; VI-NEXT: s_mul_i32 s8, s4, s13
+; VI-NEXT: s_addc_u32 s11, 0, s8
+; VI-NEXT: s_mul_i32 s8, s4, s11
; VI-NEXT: v_readfirstlane_b32 s9, v1
; VI-NEXT: s_add_i32 s8, s9, s8
-; VI-NEXT: s_mul_i32 s9, s5, s12
-; VI-NEXT: s_add_i32 s14, s8, s9
-; VI-NEXT: s_sub_i32 s10, s3, s14
+; VI-NEXT: s_mul_i32 s9, s5, s10
+; VI-NEXT: s_add_i32 s12, s8, s9
+; VI-NEXT: s_sub_i32 s13, s3, s12
; VI-NEXT: v_readfirstlane_b32 s8, v0
-; VI-NEXT: s_sub_u32 s15, s2, s8
+; VI-NEXT: s_sub_u32 s14, s2, s8
; VI-NEXT: s_cselect_b64 s[8:9], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[8:9], 0
-; VI-NEXT: s_subb_u32 s16, s10, s5
-; VI-NEXT: s_sub_u32 s17, s15, s4
-; VI-NEXT: s_cselect_b64 s[10:11], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[10:11], 0
-; VI-NEXT: s_subb_u32 s10, s16, 0
-; VI-NEXT: s_cmp_ge_u32 s10, s5
-; VI-NEXT: s_cselect_b32 s11, -1, 0
-; VI-NEXT: s_cmp_ge_u32 s17, s4
+; VI-NEXT: s_subb_u32 s13, s13, s5
+; VI-NEXT: s_sub_u32 s15, s14, s4
+; VI-NEXT: s_subb_u32 s13, s13, 0
+; VI-NEXT: s_cmp_ge_u32 s13, s5
; VI-NEXT: s_cselect_b32 s16, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s10, s5
-; VI-NEXT: s_cselect_b32 s10, s16, s11
-; VI-NEXT: s_add_u32 s11, s12, 1
-; VI-NEXT: s_addc_u32 s16, s13, 0
-; VI-NEXT: s_add_u32 s17, s12, 2
-; VI-NEXT: s_addc_u32 s18, s13, 0
-; VI-NEXT: s_cmp_lg_u32 s10, 0
-; VI-NEXT: s_cselect_b32 s10, s17, s11
-; VI-NEXT: s_cselect_b32 s11, s18, s16
+; VI-NEXT: s_cmp_ge_u32 s15, s4
+; VI-NEXT: s_cselect_b32 s15, -1, 0
+; VI-NEXT: s_cmp_eq_u32 s13, s5
+; VI-NEXT: s_cselect_b32 s13, s15, s16
+; VI-NEXT: s_add_u32 s15, s10, 1
+; VI-NEXT: s_addc_u32 s16, s11, 0
+; VI-NEXT: s_add_u32 s17, s10, 2
+; VI-NEXT: s_addc_u32 s18, s11, 0
+; VI-NEXT: s_cmp_lg_u32 s13, 0
+; VI-NEXT: s_cselect_b32 s13, s17, s15
+; VI-NEXT: s_cselect_b32 s15, s18, s16
; VI-NEXT: s_cmp_lg_u64 s[8:9], 0
-; VI-NEXT: s_subb_u32 s3, s3, s14
+; VI-NEXT: s_subb_u32 s3, s3, s12
; VI-NEXT: s_cmp_ge_u32 s3, s5
; VI-NEXT: s_cselect_b32 s8, -1, 0
-; VI-NEXT: s_cmp_ge_u32 s15, s4
+; VI-NEXT: s_cmp_ge_u32 s14, s4
; VI-NEXT: s_cselect_b32 s9, -1, 0
; VI-NEXT: s_cmp_eq_u32 s3, s5
; VI-NEXT: s_cselect_b32 s3, s9, s8
; VI-NEXT: s_cmp_lg_u32 s3, 0
-; VI-NEXT: s_cselect_b32 s9, s11, s13
-; VI-NEXT: s_cselect_b32 s8, s10, s12
+; VI-NEXT: s_cselect_b32 s9, s15, s11
+; VI-NEXT: s_cselect_b32 s8, s13, s10
; VI-NEXT: s_cbranch_execnz .LBB16_4
; VI-NEXT: .LBB16_2:
; VI-NEXT: v_cvt_f32_u32_e32 v0, s4
@@ -2311,8 +2278,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT: s_sub_u32 s10, 0, s6
-; GFX9-NEXT: s_subb_u32 s11, 0, s7
+; GFX9-NEXT: s_sub_u32 s8, 0, s6
+; GFX9-NEXT: s_subb_u32 s9, 0, s7
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2321,109 +2288,102 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX9-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s12, v1
-; GFX9-NEXT: v_readfirstlane_b32 s8, v0
-; GFX9-NEXT: s_mul_i32 s9, s10, s12
-; GFX9-NEXT: s_mul_hi_u32 s14, s10, s8
-; GFX9-NEXT: s_mul_i32 s13, s11, s8
-; GFX9-NEXT: s_add_i32 s9, s14, s9
-; GFX9-NEXT: s_add_i32 s9, s9, s13
-; GFX9-NEXT: s_mul_i32 s15, s10, s8
-; GFX9-NEXT: s_mul_i32 s14, s8, s9
-; GFX9-NEXT: s_mul_hi_u32 s16, s8, s15
-; GFX9-NEXT: s_mul_hi_u32 s13, s8, s9
+; GFX9-NEXT: v_readfirstlane_b32 s10, v1
+; GFX9-NEXT: v_readfirstlane_b32 s11, v0
+; GFX9-NEXT: s_mul_i32 s12, s8, s10
+; GFX9-NEXT: s_mul_hi_u32 s14, s8, s11
+; GFX9-NEXT: s_mul_i32 s13, s9, s11
+; GFX9-NEXT: s_add_i32 s12, s14, s12
+; GFX9-NEXT: s_add_i32 s12, s12, s13
+; GFX9-NEXT: s_mul_i32 s15, s8, s11
+; GFX9-NEXT: s_mul_i32 s14, s11, s12
+; GFX9-NEXT: s_mul_hi_u32 s16, s11, s15
+; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12
; GFX9-NEXT: s_add_u32 s14, s16, s14
; GFX9-NEXT: s_addc_u32 s13, 0, s13
-; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15
-; GFX9-NEXT: s_mul_i32 s15, s12, s15
+; GFX9-NEXT: s_mul_hi_u32 s17, s10, s15
+; GFX9-NEXT: s_mul_i32 s15, s10, s15
; GFX9-NEXT: s_add_u32 s14, s14, s15
-; GFX9-NEXT: s_mul_hi_u32 s16, s12, s9
+; GFX9-NEXT: s_mul_hi_u32 s16, s10, s12
; GFX9-NEXT: s_addc_u32 s13, s13, s17
; GFX9-NEXT: s_addc_u32 s14, s16, 0
-; GFX9-NEXT: s_mul_i32 s9, s12, s9
-; GFX9-NEXT: s_add_u32 s9, s13, s9
+; GFX9-NEXT: s_mul_i32 s12, s10, s12
+; GFX9-NEXT: s_add_u32 s12, s13, s12
; GFX9-NEXT: s_addc_u32 s13, 0, s14
-; GFX9-NEXT: s_add_u32 s14, s8, s9
-; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GFX9-NEXT: s_addc_u32 s12, s12, s13
-; GFX9-NEXT: s_mul_i32 s8, s10, s12
-; GFX9-NEXT: s_mul_hi_u32 s9, s10, s14
-; GFX9-NEXT: s_add_i32 s8, s9, s8
-; GFX9-NEXT: s_mul_i32 s11, s11, s14
-; GFX9-NEXT: s_add_i32 s8, s8, s11
-; GFX9-NEXT: s_mul_i32 s10, s10, s14
-; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10
-; GFX9-NEXT: s_mul_i32 s13, s12, s10
-; GFX9-NEXT: s_mul_i32 s16, s14, s8
-; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10
-; GFX9-NEXT: s_mul_hi_u32 s15, s14, s8
-; GFX9-NEXT: s_add_u32 s10, s10, s16
+; GFX9-NEXT: s_add_u32 s11, s11, s12
+; GFX9-NEXT: s_addc_u32 s10, s10, s13
+; GFX9-NEXT: s_mul_i32 s12, s8, s10
+; GFX9-NEXT: s_mul_hi_u32 s13, s8, s11
+; GFX9-NEXT: s_add_i32 s12, s13, s12
+; GFX9-NEXT: s_mul_i32 s9, s9, s11
+; GFX9-NEXT: s_add_i32 s12, s12, s9
+; GFX9-NEXT: s_mul_i32 s8, s8, s11
+; GFX9-NEXT: s_mul_hi_u32 s13, s10, s8
+; GFX9-NEXT: s_mul_i32 s14, s10, s8
+; GFX9-NEXT: s_mul_i32 s16, s11, s12
+; GFX9-NEXT: s_mul_hi_u32 s8, s11, s8
+; GFX9-NEXT: s_mul_hi_u32 s15, s11, s12
+; GFX9-NEXT: s_add_u32 s8, s8, s16
; GFX9-NEXT: s_addc_u32 s15, 0, s15
-; GFX9-NEXT: s_add_u32 s10, s10, s13
-; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8
-; GFX9-NEXT: s_addc_u32 s10, s15, s11
+; GFX9-NEXT: s_add_u32 s8, s8, s14
+; GFX9-NEXT: s_mul_hi_u32 s9, s10, s12
+; GFX9-NEXT: s_addc_u32 s8, s15, s13
; GFX9-NEXT: s_addc_u32 s9, s9, 0
-; GFX9-NEXT: s_mul_i32 s8, s12, s8
-; GFX9-NEXT: s_add_u32 s8, s10, s8
-; GFX9-NEXT: s_addc_u32 s10, 0, s9
-; GFX9-NEXT: s_add_u32 s11, s14, s8
-; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GFX9-NEXT: s_addc_u32 s8, s12, s10
-; GFX9-NEXT: s_mul_i32 s10, s2, s8
-; GFX9-NEXT: s_mul_hi_u32 s12, s2, s11
-; GFX9-NEXT: s_mul_hi_u32 s9, s2, s8
-; GFX9-NEXT: s_add_u32 s10, s12, s10
+; GFX9-NEXT: s_mul_i32 s12, s10, s12
+; GFX9-NEXT: s_add_u32 s8, s8, s12
; GFX9-NEXT: s_addc_u32 s9, 0, s9
-; GFX9-NEXT: s_mul_hi_u32 s13, s3, s11
-; GFX9-NEXT: s_mul_i32 s11, s3, s11
-; GFX9-NEXT: s_add_u32 s10, s10, s11
-; GFX9-NEXT: s_mul_hi_u32 s12, s3, s8
-; GFX9-NEXT: s_addc_u32 s9, s9, s13
-; GFX9-NEXT: s_addc_u32 s10, s12, 0
+; GFX9-NEXT: s_add_u32 s8, s11, s8
+; GFX9-NEXT: s_addc_u32 s9, s10, s9
+; GFX9-NEXT: s_mul_i32 s11, s2, s9
+; GFX9-NEXT: s_mul_hi_u32 s12, s2, s8
+; GFX9-NEXT: s_mul_hi_u32 s10, s2, s9
+; GFX9-NEXT: s_add_u32 s11, s12, s11
+; GFX9-NEXT: s_addc_u32 s10, 0, s10
+; GFX9-NEXT: s_mul_hi_u32 s13, s3, s8
; GFX9-NEXT: s_mul_i32 s8, s3, s8
-; GFX9-NEXT: s_add_u32 s12, s9, s8
-; GFX9-NEXT: s_addc_u32 s13, 0, s10
-; GFX9-NEXT: s_mul_i32 s8, s6, s13
-; GFX9-NEXT: s_mul_hi_u32 s9, s6, s12
+; GFX9-NEXT: s_add_u32 s8, s11, s8
+; GFX9-NEXT: s_mul_hi_u32 s12, s3, s9
+; GFX9-NEXT: s_addc_u32 s8, s10, s13
+; GFX9-NEXT: s_addc_u32 s10, s12, 0
+; GFX9-NEXT: s_mul_i32 s9, s3, s9
+; GFX9-NEXT: s_add_u32 s11, s8, s9
+; GFX9-NEXT: s_addc_u32 s10, 0, s10
+; GFX9-NEXT: s_mul_i32 s8, s6, s10
+; GFX9-NEXT: s_mul_hi_u32 s9, s6, s11
; GFX9-NEXT: s_add_i32 s8, s9, s8
-; GFX9-NEXT: s_mul_i32 s9, s7, s12
-; GFX9-NEXT: s_add_i32 s14, s8, s9
-; GFX9-NEXT: s_sub_i32 s10, s3, s14
-; GFX9-NEXT: s_mul_i32 s8, s6, s12
-; GFX9-NEXT: s_sub_u32 s15, s2, s8
+; GFX9-NEXT: s_mul_i32 s9, s7, s11
+; GFX9-NEXT: s_add_i32 s12, s8, s9
+; GFX9-NEXT: s_sub_i32 s13, s3, s12
+; GFX9-NEXT: s_mul_i32 s8, s6, s11
+; GFX9-NEXT: s_sub_u32 s14, s2, s8
; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GFX9-NEXT: s_subb_u32 s16, s10, s7
-; GFX9-NEXT: s_sub_u32 s17, s15, s6
-; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s10, s16, 0
-; GFX9-NEXT: s_cmp_ge_u32 s10, s7
-; GFX9-NEXT: s_cselect_b32 s11, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s17, s6
+; GFX9-NEXT: s_subb_u32 s13, s13, s7
+; GFX9-NEXT: s_sub_u32 s15, s14, s6
+; GFX9-NEXT: s_subb_u32 s13, s13, 0
+; GFX9-NEXT: s_cmp_ge_u32 s13, s7
; GFX9-NEXT: s_cselect_b32 s16, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s10, s7
-; GFX9-NEXT: s_cselect_b32 s10, s16, s11
-; GFX9-NEXT: s_add_u32 s11, s12, 1
-; GFX9-NEXT: s_addc_u32 s16, s13, 0
-; GFX9-NEXT: s_add_u32 s17, s12, 2
-; GFX9-NEXT: s_addc_u32 s18, s13, 0
-; GFX9-NEXT: s_cmp_lg_u32 s10, 0
-; GFX9-NEXT: s_cselect_b32 s10, s17, s11
-; GFX9-NEXT: s_cselect_b32 s11, s18, s16
+; GFX9-NEXT: s_cmp_ge_u32 s15, s6
+; GFX9-NEXT: s_cselect_b32 s15, -1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s13, s7
+; GFX9-NEXT: s_cselect_b32 s13, s15, s16
+; GFX9-NEXT: s_add_u32 s15, s11, 1
+; GFX9-NEXT: s_addc_u32 s16, s10, 0
+; GFX9-NEXT: s_add_u32 s17, s11, 2
+; GFX9-NEXT: s_addc_u32 s18, s10, 0
+; GFX9-NEXT: s_cmp_lg_u32 s13, 0
+; GFX9-NEXT: s_cselect_b32 s13, s17, s15
+; GFX9-NEXT: s_cselect_b32 s15, s18, s16
; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GFX9-NEXT: s_subb_u32 s3, s3, s14
+; GFX9-NEXT: s_subb_u32 s3, s3, s12
; GFX9-NEXT: s_cmp_ge_u32 s3, s7
; GFX9-NEXT: s_cselect_b32 s8, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s15, s6
+; GFX9-NEXT: s_cmp_ge_u32 s14, s6
; GFX9-NEXT: s_cselect_b32 s9, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s3, s7
; GFX9-NEXT: s_cselect_b32 s3, s9, s8
; GFX9-NEXT: s_cmp_lg_u32 s3, 0
-; GFX9-NEXT: s_cselect_b32 s9, s11, s13
-; GFX9-NEXT: s_cselect_b32 s8, s10, s12
+; GFX9-NEXT: s_cselect_b32 s9, s15, s10
+; GFX9-NEXT: s_cselect_b32 s8, s13, s11
; GFX9-NEXT: s_cbranch_execnz .LBB16_3
; GFX9-NEXT: .LBB16_2:
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
@@ -2503,44 +2463,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1010-NEXT: s_add_u32 s11, s12, s11
; GFX1010-NEXT: s_addc_u32 s12, 0, s13
; GFX1010-NEXT: s_add_u32 s8, s8, s11
-; GFX1010-NEXT: s_cselect_b32 s11, -1, 0
-; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s8
-; GFX1010-NEXT: s_cmp_lg_u32 s11, 0
-; GFX1010-NEXT: s_mul_i32 s11, s9, s8
; GFX1010-NEXT: s_addc_u32 s5, s5, s12
-; GFX1010-NEXT: s_mul_i32 s10, s10, s8
+; GFX1010-NEXT: s_mul_hi_u32 s11, s9, s8
+; GFX1010-NEXT: s_mul_i32 s12, s9, s8
; GFX1010-NEXT: s_mul_i32 s9, s9, s5
-; GFX1010-NEXT: s_mul_hi_u32 s12, s8, s11
-; GFX1010-NEXT: s_add_i32 s9, s13, s9
-; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s11
+; GFX1010-NEXT: s_mul_i32 s10, s10, s8
+; GFX1010-NEXT: s_add_i32 s9, s11, s9
+; GFX1010-NEXT: s_mul_i32 s11, s5, s12
; GFX1010-NEXT: s_add_i32 s9, s9, s10
-; GFX1010-NEXT: s_mul_i32 s10, s5, s11
+; GFX1010-NEXT: s_mul_hi_u32 s10, s8, s12
; GFX1010-NEXT: s_mul_i32 s15, s8, s9
; GFX1010-NEXT: s_mul_hi_u32 s14, s8, s9
-; GFX1010-NEXT: s_add_u32 s12, s12, s15
+; GFX1010-NEXT: s_add_u32 s10, s10, s15
+; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s12
; GFX1010-NEXT: s_addc_u32 s14, 0, s14
-; GFX1010-NEXT: s_mul_hi_u32 s11, s5, s9
-; GFX1010-NEXT: s_add_u32 s10, s12, s10
+; GFX1010-NEXT: s_mul_hi_u32 s12, s5, s9
+; GFX1010-NEXT: s_add_u32 s10, s10, s11
; GFX1010-NEXT: s_mul_i32 s9, s5, s9
; GFX1010-NEXT: s_addc_u32 s10, s14, s13
-; GFX1010-NEXT: s_addc_u32 s11, s11, 0
+; GFX1010-NEXT: s_addc_u32 s11, s12, 0
; GFX1010-NEXT: s_add_u32 s9, s10, s9
; GFX1010-NEXT: s_addc_u32 s10, 0, s11
; GFX1010-NEXT: s_add_u32 s8, s8, s9
-; GFX1010-NEXT: s_cselect_b32 s9, -1, 0
-; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s8
-; GFX1010-NEXT: s_cmp_lg_u32 s9, 0
-; GFX1010-NEXT: s_mul_hi_u32 s9, s3, s8
; GFX1010-NEXT: s_addc_u32 s5, s5, s10
-; GFX1010-NEXT: s_mul_i32 s8, s3, s8
+; GFX1010-NEXT: s_mul_hi_u32 s9, s2, s8
; GFX1010-NEXT: s_mul_i32 s12, s2, s5
-; GFX1010-NEXT: s_mul_hi_u32 s10, s2, s5
-; GFX1010-NEXT: s_add_u32 s11, s11, s12
-; GFX1010-NEXT: s_addc_u32 s10, 0, s10
+; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s5
+; GFX1010-NEXT: s_mul_hi_u32 s10, s3, s8
+; GFX1010-NEXT: s_mul_i32 s8, s3, s8
+; GFX1010-NEXT: s_add_u32 s9, s9, s12
+; GFX1010-NEXT: s_addc_u32 s11, 0, s11
; GFX1010-NEXT: s_mul_hi_u32 s13, s3, s5
-; GFX1010-NEXT: s_add_u32 s8, s11, s8
+; GFX1010-NEXT: s_add_u32 s8, s9, s8
; GFX1010-NEXT: s_mul_i32 s5, s3, s5
-; GFX1010-NEXT: s_addc_u32 s8, s10, s9
+; GFX1010-NEXT: s_addc_u32 s8, s11, s10
; GFX1010-NEXT: s_addc_u32 s9, s13, 0
; GFX1010-NEXT: s_add_u32 s5, s8, s5
; GFX1010-NEXT: s_addc_u32 s8, 0, s9
@@ -2553,11 +2509,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1010-NEXT: s_sub_i32 s11, s3, s9
; GFX1010-NEXT: s_sub_u32 s10, s2, s10
; GFX1010-NEXT: s_cselect_b32 s12, -1, 0
-; GFX1010-NEXT: s_cmp_lg_u32 s12, 0
; GFX1010-NEXT: s_subb_u32 s11, s11, s7
; GFX1010-NEXT: s_sub_u32 s13, s10, s6
-; GFX1010-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1010-NEXT: s_cmp_lg_u32 s14, 0
; GFX1010-NEXT: s_subb_u32 s11, s11, 0
; GFX1010-NEXT: s_cmp_ge_u32 s11, s7
; GFX1010-NEXT: s_cselect_b32 s14, -1, 0
@@ -2663,44 +2616,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: s_add_u32 s11, s12, s11
; GFX1030W32-NEXT: s_addc_u32 s12, 0, s13
; GFX1030W32-NEXT: s_add_u32 s8, s8, s11
-; GFX1030W32-NEXT: s_cselect_b32 s11, -1, 0
-; GFX1030W32-NEXT: s_mul_hi_u32 s13, s9, s8
-; GFX1030W32-NEXT: s_cmp_lg_u32 s11, 0
-; GFX1030W32-NEXT: s_mul_i32 s11, s9, s8
; GFX1030W32-NEXT: s_addc_u32 s7, s7, s12
-; GFX1030W32-NEXT: s_mul_i32 s10, s10, s8
+; GFX1030W32-NEXT: s_mul_hi_u32 s11, s9, s8
+; GFX1030W32-NEXT: s_mul_i32 s12, s9, s8
; GFX1030W32-NEXT: s_mul_i32 s9, s9, s7
-; GFX1030W32-NEXT: s_mul_hi_u32 s12, s8, s11
-; GFX1030W32-NEXT: s_add_i32 s9, s13, s9
-; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s11
+; GFX1030W32-NEXT: s_mul_i32 s10, s10, s8
+; GFX1030W32-NEXT: s_add_i32 s9, s11, s9
+; GFX1030W32-NEXT: s_mul_i32 s11, s7, s12
; GFX1030W32-NEXT: s_add_i32 s9, s9, s10
-; GFX1030W32-NEXT: s_mul_i32 s10, s7, s11
+; GFX1030W32-NEXT: s_mul_hi_u32 s10, s8, s12
; GFX1030W32-NEXT: s_mul_i32 s15, s8, s9
; GFX1030W32-NEXT: s_mul_hi_u32 s14, s8, s9
-; GFX1030W32-NEXT: s_add_u32 s12, s12, s15
+; GFX1030W32-NEXT: s_add_u32 s10, s10, s15
+; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s12
; GFX1030W32-NEXT: s_addc_u32 s14, 0, s14
-; GFX1030W32-NEXT: s_mul_hi_u32 s11, s7, s9
-; GFX1030W32-NEXT: s_add_u32 s10, s12, s10
+; GFX1030W32-NEXT: s_mul_hi_u32 s12, s7, s9
+; GFX1030W32-NEXT: s_add_u32 s10, s10, s11
; GFX1030W32-NEXT: s_mul_i32 s9, s7, s9
; GFX1030W32-NEXT: s_addc_u32 s10, s14, s13
-; GFX1030W32-NEXT: s_addc_u32 s11, s11, 0
+; GFX1030W32-NEXT: s_addc_u32 s11, s12, 0
; GFX1030W32-NEXT: s_add_u32 s9, s10, s9
; GFX1030W32-NEXT: s_addc_u32 s10, 0, s11
; GFX1030W32-NEXT: s_add_u32 s8, s8, s9
-; GFX1030W32-NEXT: s_cselect_b32 s9, -1, 0
-; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s8
-; GFX1030W32-NEXT: s_cmp_lg_u32 s9, 0
-; GFX1030W32-NEXT: s_mul_hi_u32 s9, s3, s8
; GFX1030W32-NEXT: s_addc_u32 s7, s7, s10
-; GFX1030W32-NEXT: s_mul_i32 s8, s3, s8
+; GFX1030W32-NEXT: s_mul_hi_u32 s9, s2, s8
; GFX1030W32-NEXT: s_mul_i32 s12, s2, s7
-; GFX1030W32-NEXT: s_mul_hi_u32 s10, s2, s7
-; GFX1030W32-NEXT: s_add_u32 s11, s11, s12
-; GFX1030W32-NEXT: s_addc_u32 s10, 0, s10
+; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s7
+; GFX1030W32-NEXT: s_mul_hi_u32 s10, s3, s8
+; GFX1030W32-NEXT: s_mul_i32 s8, s3, s8
+; GFX1030W32-NEXT: s_add_u32 s9, s9, s12
+; GFX1030W32-NEXT: s_addc_u32 s11, 0, s11
; GFX1030W32-NEXT: s_mul_hi_u32 s13, s3, s7
-; GFX1030W32-NEXT: s_add_u32 s8, s11, s8
+; GFX1030W32-NEXT: s_add_u32 s8, s9, s8
; GFX1030W32-NEXT: s_mul_i32 s7, s3, s7
-; GFX1030W32-NEXT: s_addc_u32 s8, s10, s9
+; GFX1030W32-NEXT: s_addc_u32 s8, s11, s10
; GFX1030W32-NEXT: s_addc_u32 s9, s13, 0
; GFX1030W32-NEXT: s_add_u32 s7, s8, s7
; GFX1030W32-NEXT: s_addc_u32 s8, 0, s9
@@ -2713,11 +2662,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: s_sub_i32 s11, s3, s9
; GFX1030W32-NEXT: s_sub_u32 s10, s2, s10
; GFX1030W32-NEXT: s_cselect_b32 s12, -1, 0
-; GFX1030W32-NEXT: s_cmp_lg_u32 s12, 0
; GFX1030W32-NEXT: s_subb_u32 s11, s11, s5
; GFX1030W32-NEXT: s_sub_u32 s13, s10, s4
-; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1030W32-NEXT: s_cmp_lg_u32 s14, 0
; GFX1030W32-NEXT: s_subb_u32 s11, s11, 0
; GFX1030W32-NEXT: s_cmp_ge_u32 s11, s5
; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0
@@ -2790,8 +2736,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W64-NEXT: ; %bb.1:
; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4
; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v1, s5
-; GFX1030W64-NEXT: s_sub_u32 s9, 0, s4
-; GFX1030W64-NEXT: s_subb_u32 s10, 0, s5
+; GFX1030W64-NEXT: s_sub_u32 s8, 0, s4
+; GFX1030W64-NEXT: s_subb_u32 s9, 0, s5
; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0
; GFX1030W64-NEXT: v_rcp_f32_e32 v0, v0
; GFX1030W64-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2800,109 +2746,102 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0
; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX1030W64-NEXT: v_readfirstlane_b32 s8, v1
-; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v0
-; GFX1030W64-NEXT: s_mul_i32 s7, s9, s8
-; GFX1030W64-NEXT: s_mul_hi_u32 s12, s9, s6
-; GFX1030W64-NEXT: s_mul_i32 s11, s10, s6
-; GFX1030W64-NEXT: s_add_i32 s7, s12, s7
-; GFX1030W64-NEXT: s_mul_i32 s13, s9, s6
-; GFX1030W64-NEXT: s_add_i32 s7, s7, s11
-; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s13
-; GFX1030W64-NEXT: s_mul_i32 s15, s6, s7
-; GFX1030W64-NEXT: s_mul_hi_u32 s14, s8, s13
-; GFX1030W64-NEXT: s_mul_i32 s11, s8, s13
-; GFX1030W64-NEXT: s_mul_hi_u32 s13, s6, s7
+; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v1
+; GFX1030W64-NEXT: v_readfirstlane_b32 s7, v0
+; GFX1030W64-NEXT: s_mul_i32 s10, s8, s6
+; GFX1030W64-NEXT: s_mul_hi_u32 s12, s8, s7
+; GFX1030W64-NEXT: s_mul_i32 s11, s9, s7
+; GFX1030W64-NEXT: s_add_i32 s10, s12, s10
+; GFX1030W64-NEXT: s_mul_i32 s13, s8, s7
+; GFX1030W64-NEXT: s_add_i32 s10, s10, s11
+; GFX1030W64-NEXT: s_mul_hi_u32 s12, s7, s13
+; GFX1030W64-NEXT: s_mul_i32 s15, s7, s10
+; GFX1030W64-NEXT: s_mul_hi_u32 s14, s6, s13
+; GFX1030W64-NEXT: s_mul_i32 s11, s6, s13
+; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s10
; GFX1030W64-NEXT: s_add_u32 s12, s12, s15
; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13
-; GFX1030W64-NEXT: s_mul_hi_u32 s16, s8, s7
+; GFX1030W64-NEXT: s_mul_hi_u32 s16, s6, s10
; GFX1030W64-NEXT: s_add_u32 s11, s12, s11
-; GFX1030W64-NEXT: s_mul_i32 s7, s8, s7
+; GFX1030W64-NEXT: s_mul_i32 s10, s6, s10
; GFX1030W64-NEXT: s_addc_u32 s11, s13, s14
; GFX1030W64-NEXT: s_addc_u32 s12, s16, 0
-; GFX1030W64-NEXT: s_add_u32 s7, s11, s7
+; GFX1030W64-NEXT: s_add_u32 s10, s11, s10
; GFX1030W64-NEXT: s_addc_u32 s11, 0, s12
-; GFX1030W64-NEXT: s_add_u32 s12, s6, s7
-; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX1030W64-NEXT: s_mul_hi_u32 s13, s9, s12
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1030W64-NEXT: s_mul_i32 s6, s9, s12
-; GFX1030W64-NEXT: s_addc_u32 s8, s8, s11
-; GFX1030W64-NEXT: s_mul_i32 s10, s10, s12
-; GFX1030W64-NEXT: s_mul_i32 s9, s9, s8
-; GFX1030W64-NEXT: s_mul_hi_u32 s7, s12, s6
-; GFX1030W64-NEXT: s_add_i32 s9, s13, s9
-; GFX1030W64-NEXT: s_mul_hi_u32 s11, s8, s6
-; GFX1030W64-NEXT: s_add_i32 s9, s9, s10
-; GFX1030W64-NEXT: s_mul_i32 s6, s8, s6
-; GFX1030W64-NEXT: s_mul_i32 s14, s12, s9
-; GFX1030W64-NEXT: s_mul_hi_u32 s13, s12, s9
-; GFX1030W64-NEXT: s_add_u32 s7, s7, s14
+; GFX1030W64-NEXT: s_add_u32 s7, s7, s10
+; GFX1030W64-NEXT: s_addc_u32 s6, s6, s11
+; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s7
+; GFX1030W64-NEXT: s_mul_i32 s11, s8, s7
+; GFX1030W64-NEXT: s_mul_i32 s8, s8, s6
+; GFX1030W64-NEXT: s_mul_i32 s9, s9, s7
+; GFX1030W64-NEXT: s_add_i32 s8, s10, s8
+; GFX1030W64-NEXT: s_mul_i32 s10, s6, s11
+; GFX1030W64-NEXT: s_add_i32 s8, s8, s9
+; GFX1030W64-NEXT: s_mul_hi_u32 s9, s7, s11
+; GFX1030W64-NEXT: s_mul_i32 s14, s7, s8
+; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s8
+; GFX1030W64-NEXT: s_add_u32 s9, s9, s14
+; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s11
; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13
-; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s9
-; GFX1030W64-NEXT: s_add_u32 s6, s7, s6
-; GFX1030W64-NEXT: s_mul_i32 s9, s8, s9
-; GFX1030W64-NEXT: s_addc_u32 s6, s13, s11
-; GFX1030W64-NEXT: s_addc_u32 s7, s10, 0
-; GFX1030W64-NEXT: s_add_u32 s6, s6, s9
-; GFX1030W64-NEXT: s_addc_u32 s9, 0, s7
-; GFX1030W64-NEXT: s_add_u32 s10, s12, s6
-; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX1030W64-NEXT: s_mul_hi_u32 s11, s2, s10
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1030W64-NEXT: s_mul_hi_u32 s6, s3, s10
-; GFX1030W64-NEXT: s_addc_u32 s7, s8, s9
-; GFX1030W64-NEXT: s_mul_i32 s8, s3, s10
-; GFX1030W64-NEXT: s_mul_i32 s10, s2, s7
-; GFX1030W64-NEXT: s_mul_hi_u32 s9, s2, s7
-; GFX1030W64-NEXT: s_add_u32 s10, s11, s10
-; GFX1030W64-NEXT: s_addc_u32 s9, 0, s9
-; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s7
-; GFX1030W64-NEXT: s_add_u32 s8, s10, s8
+; GFX1030W64-NEXT: s_mul_hi_u32 s11, s6, s8
+; GFX1030W64-NEXT: s_add_u32 s9, s9, s10
+; GFX1030W64-NEXT: s_mul_i32 s8, s6, s8
+; GFX1030W64-NEXT: s_addc_u32 s9, s13, s12
+; GFX1030W64-NEXT: s_addc_u32 s10, s11, 0
+; GFX1030W64-NEXT: s_add_u32 s8, s9, s8
+; GFX1030W64-NEXT: s_addc_u32 s9, 0, s10
+; GFX1030W64-NEXT: s_add_u32 s7, s7, s8
+; GFX1030W64-NEXT: s_addc_u32 s6, s6, s9
+; GFX1030W64-NEXT: s_mul_hi_u32 s8, s2, s7
+; GFX1030W64-NEXT: s_mul_i32 s11, s2, s6
+; GFX1030W64-NEXT: s_mul_hi_u32 s10, s2, s6
+; GFX1030W64-NEXT: s_mul_hi_u32 s9, s3, s7
; GFX1030W64-NEXT: s_mul_i32 s7, s3, s7
-; GFX1030W64-NEXT: s_addc_u32 s6, s9, s6
+; GFX1030W64-NEXT: s_add_u32 s8, s8, s11
+; GFX1030W64-NEXT: s_addc_u32 s10, 0, s10
+; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s6
+; GFX1030W64-NEXT: s_add_u32 s7, s8, s7
+; GFX1030W64-NEXT: s_mul_i32 s6, s3, s6
+; GFX1030W64-NEXT: s_addc_u32 s7, s10, s9
; GFX1030W64-NEXT: s_addc_u32 s8, s12, 0
-; GFX1030W64-NEXT: s_add_u32 s10, s6, s7
+; GFX1030W64-NEXT: s_add_u32 s10, s7, s6
; GFX1030W64-NEXT: s_addc_u32 s11, 0, s8
; GFX1030W64-NEXT: s_mul_hi_u32 s6, s4, s10
; GFX1030W64-NEXT: s_mul_i32 s7, s4, s11
; GFX1030W64-NEXT: s_mul_i32 s8, s5, s10
; GFX1030W64-NEXT: s_add_i32 s6, s6, s7
-; GFX1030W64-NEXT: s_add_i32 s12, s6, s8
+; GFX1030W64-NEXT: s_add_i32 s8, s6, s8
; GFX1030W64-NEXT: s_mul_i32 s6, s4, s10
-; GFX1030W64-NEXT: s_sub_i32 s8, s3, s12
-; GFX1030W64-NEXT: s_sub_u32 s13, s2, s6
+; GFX1030W64-NEXT: s_sub_i32 s9, s3, s8
+; GFX1030W64-NEXT: s_sub_u32 s12, s2, s6
; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1030W64-NEXT: s_subb_u32 s14, s8, s5
-; GFX1030W64-NEXT: s_sub_u32 s15, s13, s4
-; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GFX1030W64-NEXT: s_subb_u32 s8, s14, 0
-; GFX1030W64-NEXT: s_cmp_ge_u32 s8, s5
-; GFX1030W64-NEXT: s_cselect_b32 s9, -1, 0
-; GFX1030W64-NEXT: s_cmp_ge_u32 s15, s4
+; GFX1030W64-NEXT: s_subb_u32 s9, s9, s5
+; GFX1030W64-NEXT: s_sub_u32 s13, s12, s4
+; GFX1030W64-NEXT: s_subb_u32 s9, s9, 0
+; GFX1030W64-NEXT: s_cmp_ge_u32 s9, s5
; GFX1030W64-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1030W64-NEXT: s_cmp_eq_u32 s8, s5
-; GFX1030W64-NEXT: s_cselect_b32 s8, s14, s9
-; GFX1030W64-NEXT: s_add_u32 s9, s10, 1
+; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4
+; GFX1030W64-NEXT: s_cselect_b32 s13, -1, 0
+; GFX1030W64-NEXT: s_cmp_eq_u32 s9, s5
+; GFX1030W64-NEXT: s_cselect_b32 s9, s13, s14
+; GFX1030W64-NEXT: s_add_u32 s13, s10, 1
; GFX1030W64-NEXT: s_addc_u32 s14, s11, 0
; GFX1030W64-NEXT: s_add_u32 s15, s10, 2
; GFX1030W64-NEXT: s_addc_u32 s16, s11, 0
-; GFX1030W64-NEXT: s_cmp_lg_u32 s8, 0
-; GFX1030W64-NEXT: s_cselect_b32 s15, s15, s9
+; GFX1030W64-NEXT: s_cmp_lg_u32 s9, 0
+; GFX1030W64-NEXT: s_cselect_b32 s13, s15, s13
; GFX1030W64-NEXT: s_cselect_b32 s14, s16, s14
; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1030W64-NEXT: s_subb_u32 s3, s3, s12
+; GFX1030W64-NEXT: s_subb_u32 s3, s3, s8
; GFX1030W64-NEXT: s_cmp_ge_u32 s3, s5
; GFX1030W64-NEXT: s_cselect_b32 s6, -1, 0
-; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4
+; GFX1030W64-NEXT: s_cmp_ge_u32 s12, s4
; GFX1030W64-NEXT: s_cselect_b32 s7, -1, 0
; GFX1030W64-NEXT: s_cmp_eq_u32 s3, s5
; GFX1030W64-NEXT: s_cselect_b32 s3, s7, s6
; GFX1030W64-NEXT: s_cmp_lg_u32 s3, 0
; GFX1030W64-NEXT: s_cselect_b32 s7, s14, s11
-; GFX1030W64-NEXT: s_cselect_b32 s6, s15, s10
+; GFX1030W64-NEXT: s_cselect_b32 s6, s13, s10
; GFX1030W64-NEXT: s_cbranch_execnz .LBB16_3
; GFX1030W64-NEXT: .LBB16_2:
; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4
@@ -2988,44 +2927,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: s_add_u32 s11, s12, s11
; GFX11-NEXT: s_addc_u32 s12, 0, s13
; GFX11-NEXT: s_add_u32 s8, s8, s11
-; GFX11-NEXT: s_cselect_b32 s11, -1, 0
-; GFX11-NEXT: s_mul_hi_u32 s13, s9, s8
-; GFX11-NEXT: s_cmp_lg_u32 s11, 0
-; GFX11-NEXT: s_mul_i32 s11, s9, s8
; GFX11-NEXT: s_addc_u32 s7, s7, s12
-; GFX11-NEXT: s_mul_i32 s10, s10, s8
+; GFX11-NEXT: s_mul_hi_u32 s11, s9, s8
+; GFX11-NEXT: s_mul_i32 s12, s9, s8
; GFX11-NEXT: s_mul_i32 s9, s9, s7
-; GFX11-NEXT: s_mul_hi_u32 s12, s8, s11
-; GFX11-NEXT: s_add_i32 s9, s13, s9
-; GFX11-NEXT: s_mul_hi_u32 s13, s7, s11
+; GFX11-NEXT: s_mul_i32 s10, s10, s8
+; GFX11-NEXT: s_add_i32 s9, s11, s9
+; GFX11-NEXT: s_mul_i32 s11, s7, s12
; GFX11-NEXT: s_add_i32 s9, s9, s10
-; GFX11-NEXT: s_mul_i32 s10, s7, s11
+; GFX11-NEXT: s_mul_hi_u32 s10, s8, s12
; GFX11-NEXT: s_mul_i32 s15, s8, s9
; GFX11-NEXT: s_mul_hi_u32 s14, s8, s9
-; GFX11-NEXT: s_add_u32 s12, s12, s15
+; GFX11-NEXT: s_add_u32 s10, s10, s15
+; GFX11-NEXT: s_mul_hi_u32 s13, s7, s12
; GFX11-NEXT: s_addc_u32 s14, 0, s14
-; GFX11-NEXT: s_mul_hi_u32 s11, s7, s9
-; GFX11-NEXT: s_add_u32 s10, s12, s10
+; GFX11-NEXT: s_mul_hi_u32 s12, s7, s9
+; GFX11-NEXT: s_add_u32 s10, s10, s11
; GFX11-NEXT: s_mul_i32 s9, s7, s9
; GFX11-NEXT: s_addc_u32 s10, s14, s13
-; GFX11-NEXT: s_addc_u32 s11, s11, 0
+; GFX11-NEXT: s_addc_u32 s11, s12, 0
; GFX11-NEXT: s_add_u32 s9, s10, s9
; GFX11-NEXT: s_addc_u32 s10, 0, s11
; GFX11-NEXT: s_add_u32 s8, s8, s9
-; GFX11-NEXT: s_cselect_b32 s9, -1, 0
-; GFX11-NEXT: s_mul_hi_u32 s11, s2, s8
-; GFX11-NEXT: s_cmp_lg_u32 s9, 0
-; GFX11-NEXT: s_mul_hi_u32 s9, s3, s8
; GFX11-NEXT: s_addc_u32 s7, s7, s10
-; GFX11-NEXT: s_mul_i32 s8, s3, s8
+; GFX11-NEXT: s_mul_hi_u32 s9, s2, s8
; GFX11-NEXT: s_mul_i32 s12, s2, s7
-; GFX11-NEXT: s_mul_hi_u32 s10, s2, s7
-; GFX11-NEXT: s_add_u32 s11, s11, s12
-; GFX11-NEXT: s_addc_u32 s10, 0, s10
+; GFX11-NEXT: s_mul_hi_u32 s11, s2, s7
+; GFX11-NEXT: s_mul_hi_u32 s10, s3, s8
+; GFX11-NEXT: s_mul_i32 s8, s3, s8
+; GFX11-NEXT: s_add_u32 s9, s9, s12
+; GFX11-NEXT: s_addc_u32 s11, 0, s11
; GFX11-NEXT: s_mul_hi_u32 s13, s3, s7
-; GFX11-NEXT: s_add_u32 s8, s11, s8
+; GFX11-NEXT: s_add_u32 s8, s9, s8
; GFX11-NEXT: s_mul_i32 s7, s3, s7
-; GFX11-NEXT: s_addc_u32 s8, s10, s9
+; GFX11-NEXT: s_addc_u32 s8, s11, s10
; GFX11-NEXT: s_addc_u32 s9, s13, 0
; GFX11-NEXT: s_add_u32 s7, s8, s7
; GFX11-NEXT: s_addc_u32 s8, 0, s9
@@ -3035,17 +2970,14 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: s_add_i32 s9, s9, s10
; GFX11-NEXT: s_mul_i32 s10, s4, s7
; GFX11-NEXT: s_add_i32 s9, s9, s11
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_sub_i32 s11, s3, s9
; GFX11-NEXT: s_sub_u32 s10, s2, s10
; GFX11-NEXT: s_cselect_b32 s12, -1, 0
-; GFX11-NEXT: s_cmp_lg_u32 s12, 0
; GFX11-NEXT: s_subb_u32 s11, s11, s5
; GFX11-NEXT: s_sub_u32 s13, s10, s4
-; GFX11-NEXT: s_cselect_b32 s14, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_lg_u32 s14, 0
; GFX11-NEXT: s_subb_u32 s11, s11, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_cmp_ge_u32 s11, s5
; GFX11-NEXT: s_cselect_b32 s14, -1, 0
; GFX11-NEXT: s_cmp_ge_u32 s13, s4
@@ -3118,9 +3050,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_or_b64 s[4:5], s[2:3], s[6:7]
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_and_b64 s[4:5], s[4:5], 0xffffffff00000000
-; GFX1250-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX1250-NEXT: s_cbranch_scc0 .LBB16_4
; GFX1250-NEXT: ; %bb.1:
; GFX1250-NEXT: s_cvt_f32_u32 s4, s6
@@ -3155,12 +3086,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[4:5], s[12:13]
; GFX1250-NEXT: s_add_co_u32 s8, s8, s12
-; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-NEXT: s_cmp_lg_u32 s4, 0
; GFX1250-NEXT: s_add_co_ci_u32 s9, s9, s13
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_mul_u64 s[10:11], s[10:11], s[8:9]
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_mul_hi_u32 s13, s8, s11
; GFX1250-NEXT: s_mul_i32 s12, s8, s11
; GFX1250-NEXT: s_mul_hi_u32 s4, s8, s10
@@ -3175,19 +3103,17 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_add_nc_u64 s[10:11], s[4:5], s[10:11]
; GFX1250-NEXT: s_add_co_u32 s8, s8, s10
-; GFX1250-NEXT: s_cselect_b32 s10, -1, 0
-; GFX1250-NEXT: s_mul_hi_u32 s4, s2, s8
-; GFX1250-NEXT: s_cmp_lg_u32 s10, 0
-; GFX1250-NEXT: s_mul_hi_u32 s12, s3, s8
; GFX1250-NEXT: s_add_co_ci_u32 s10, s9, s11
-; GFX1250-NEXT: s_mul_i32 s11, s3, s8
+; GFX1250-NEXT: s_mul_hi_u32 s4, s2, s8
+; GFX1250-NEXT: s_mul_hi_u32 s11, s3, s8
+; GFX1250-NEXT: s_mul_i32 s12, s3, s8
; GFX1250-NEXT: s_mul_hi_u32 s9, s2, s10
; GFX1250-NEXT: s_mul_i32 s8, s2, s10
; GFX1250-NEXT: s_mul_hi_u32 s13, s3, s10
; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[8:9]
; GFX1250-NEXT: s_mul_i32 s10, s3, s10
-; GFX1250-NEXT: s_add_co_u32 s4, s8, s11
-; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s12
+; GFX1250-NEXT: s_add_co_u32 s4, s8, s12
+; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s11
; GFX1250-NEXT: s_add_co_ci_u32 s11, s13, 0
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[10:11]
@@ -3202,10 +3128,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: s_cmp_lg_u32 s8, 0
; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, s7
; GFX1250-NEXT: s_sub_co_u32 s13, s4, s6
-; GFX1250-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-NEXT: s_cmp_lg_u32 s14, 0
; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, 0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_cmp_ge_u32 s12, s7
; GFX1250-NEXT: s_cselect_b32 s14, -1, 0
; GFX1250-NEXT: s_cmp_ge_u32 s13, s6
diff --git a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
index 57a1e4c..ec92edb 100644
--- a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
@@ -3385,7 +3385,7 @@ declare half @llvm.canonicalize.f16(half)
declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>)
attributes #0 = { nounwind "amdgpu-ieee"="false" }
-attributes #1 = { nounwind "unsafe-fp-math"="true" "no-nans-fp-math"="true" }
+attributes #1 = { nounwind "no-nans-fp-math"="true" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX11NONANS-FAKE16: {{.*}}
; GFX11NONANS-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 4b151b9..07e6a76 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -714,9 +714,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; VI-NEXT: s_lshl_b32 s2, s2, 8
; VI-NEXT: s_or_b32 s2, s2, s3
; VI-NEXT: s_lshl_b32 s3, s2, 16
-; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: s_flbit_i32_b32 s3, s3
-; VI-NEXT: s_cmp_lg_u32 s2, 0
+; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: s_cselect_b32 s2, s3, 32
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index cefcbdd..fca57be 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -1491,7 +1491,6 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s4, s6, 16
-; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: s_cbranch_scc0 .LBB14_4
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: s_mov_b32 s11, 0xf000
@@ -1521,7 +1520,6 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s6, 16
-; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: s_cbranch_scc0 .LBB14_4
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_mov_b32 s11, 0xf000
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index d8a5e7fa..dbdea8e 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -14,7 +14,6 @@ define i32 @s_add_co_select_user() {
; GFX7-NEXT: s_add_u32 s7, s6, s6
; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX7-NEXT: s_or_b32 s4, s4, s5
-; GFX7-NEXT: s_cmp_lg_u32 s4, 0
; GFX7-NEXT: s_addc_u32 s8, s6, 0
; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec
@@ -31,8 +30,6 @@ define i32 @s_add_co_select_user() {
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_u32 s7, s6, s6
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-NEXT: s_addc_u32 s8, s6, 0
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
@@ -49,8 +46,6 @@ define i32 @s_add_co_select_user() {
; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s5, s4, s4
-; GFX10-NEXT: s_cselect_b32 s6, -1, 0
-; GFX10-NEXT: s_cmp_lg_u32 s6, 0
; GFX10-NEXT: s_addc_u32 s6, s4, 0
; GFX10-NEXT: s_cselect_b32 s7, -1, 0
; GFX10-NEXT: s_and_b32 s7, s7, exec_lo
@@ -67,16 +62,13 @@ define i32 @s_add_co_select_user() {
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s1, s0, s0
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-NEXT: s_addc_u32 s2, s0, 0
; GFX11-NEXT: s_cselect_b32 s3, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s3, s3, exec_lo
; GFX11-NEXT: s_cselect_b32 s2, s2, 0
; GFX11-NEXT: s_cmp_gt_u32 s0, 31
; GFX11-NEXT: s_cselect_b32 s0, s1, s2
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
@@ -104,7 +96,6 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX7-NEXT: s_add_u32 s0, s2, s2
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX7-NEXT: s_or_b32 s0, s0, s1
-; GFX7-NEXT: s_cmp_lg_u32 s0, 0
; GFX7-NEXT: s_addc_u32 s0, s2, 0
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX7-NEXT: s_andn2_b64 vcc, exec, s[0:1]
@@ -125,12 +116,10 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
;
; GFX9-LABEL: s_add_co_br_user:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0
+; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s2, s2
-; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_addc_u32 s0, s2, 0
+; GFX9-NEXT: s_add_u32 s1, s0, s0
+; GFX9-NEXT: s_addc_u32 s0, s0, 0
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GFX9-NEXT: s_cbranch_vccnz .LBB1_2
@@ -153,8 +142,6 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s1, s0, s0
-; GFX10-NEXT: s_cselect_b32 s1, -1, 0
-; GFX10-NEXT: s_cmp_lg_u32 s1, 0
; GFX10-NEXT: s_addc_u32 s0, s0, 0
; GFX10-NEXT: s_cselect_b32 s0, -1, 0
; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s0
@@ -178,11 +165,9 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s1, s0, s0
-; GFX11-NEXT: s_cselect_b32 s1, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-NEXT: s_addc_u32 s0, s0, 0
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_vccnz .LBB1_2
; GFX11-NEXT: ; %bb.1: ; %bb0
diff --git a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
index 13206ad..f45070c 100644
--- a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
-; RUN: llc -mtriple=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
+; RUN: llc -mtriple=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
+; RUN: llc -mtriple=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
-; RUN: llc -mtriple=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FASTFMA %s
-; RUN: llc -mtriple=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SLOWFMA %s
+; RUN: llc -mtriple=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FASTFMA %s
+; RUN: llc -mtriple=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SLOWFMA %s
; FIXME: This should also fold when fma is actually fast if an FMA
; exists in the original program.
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 62847b1..9a17538 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1117,7 +1117,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; SI: ; %bb.0:
; SI-NEXT: s_and_b32 s3, s1, 0x1ff
; SI-NEXT: s_or_b32 s0, s3, s0
-; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SI-NEXT: s_lshr_b32 s0, s1, 8
@@ -1169,7 +1168,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; VI: ; %bb.0:
; VI-NEXT: s_and_b32 s3, s1, 0x1ff
; VI-NEXT: s_or_b32 s0, s3, s0
-; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; VI-NEXT: s_lshr_b32 s0, s1, 8
@@ -1217,7 +1215,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s3, s1, 0x1ff
; GFX9-NEXT: s_or_b32 s0, s3, s0
-; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9-NEXT: s_lshr_b32 s0, s1, 8
@@ -1264,11 +1261,9 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_and_b32 s3, s1, 0x1ff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_or_b32 s0, s3, s0
-; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-TRUE16-NEXT: s_cselect_b32 s0, -1, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-TRUE16-NEXT: s_bfe_u32 s0, s1, 0xb0014
; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 8
@@ -1320,11 +1315,9 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_and_b32 s3, s1, 0x1ff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_or_b32 s0, s3, s0
-; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-FAKE16-NEXT: s_cselect_b32 s0, -1, 0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-FAKE16-NEXT: s_bfe_u32 s0, s1, 0xb0014
; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 8
@@ -4023,7 +4016,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; SI-NEXT: s_and_b32 s6, s4, 0xffe
; SI-NEXT: s_and_b32 s4, s1, 0x1ff
; SI-NEXT: s_or_b32 s0, s4, s0
-; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: v_cvt_f16_f32_e32 v0, s5
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
@@ -4066,7 +4058,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; SI-NEXT: s_and_b32 s5, s0, 0xffe
; SI-NEXT: s_and_b32 s0, s3, 0x1ff
; SI-NEXT: s_or_b32 s0, s0, s2
-; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; SI-NEXT: v_readfirstlane_b32 s0, v2
@@ -4120,10 +4111,9 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; VI-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_lshr_b32 s5, s3, 8
-; VI-NEXT: s_and_b32 s6, s3, 0x1ff
; VI-NEXT: s_and_b32 s5, s5, 0xffe
+; VI-NEXT: s_and_b32 s6, s3, 0x1ff
; VI-NEXT: s_or_b32 s2, s6, s2
-; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cselect_b64 s[6:7], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
; VI-NEXT: s_bfe_u32 s3, s3, 0xb0014
@@ -4163,7 +4153,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; VI-NEXT: s_and_b32 s7, s2, 0xffe
; VI-NEXT: s_and_b32 s2, s1, 0x1ff
; VI-NEXT: s_or_b32 s0, s2, s0
-; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; VI-NEXT: s_bfe_u32 s1, s1, 0xb0014
@@ -4209,10 +4198,9 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; GFX9-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshr_b32 s5, s3, 8
-; GFX9-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX9-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX9-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX9-NEXT: s_or_b32 s2, s6, s2
-; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
; GFX9-NEXT: s_bfe_u32 s6, s3, 0xb0014
@@ -4254,7 +4242,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; GFX9-NEXT: s_and_b32 s6, s2, 0xffe
; GFX9-NEXT: s_and_b32 s2, s1, 0x1ff
; GFX9-NEXT: s_or_b32 s0, s2, s0
-; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
@@ -4301,11 +4288,10 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
;
; GFX11-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s5, s3, 0x1ff
-; GFX11-NEXT: s_lshr_b32 s6, s3, 8
-; GFX11-NEXT: s_or_b32 s2, s5, s2
-; GFX11-NEXT: s_and_b32 s5, s6, 0xffe
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-NEXT: s_lshr_b32 s5, s3, 8
+; GFX11-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX11-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-NEXT: s_or_b32 s2, s6, s2
; GFX11-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
@@ -4348,13 +4334,12 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x40f
; GFX11-NEXT: s_cselect_b32 s2, s5, s6
; GFX11-NEXT: s_lshr_b32 s3, s3, 16
-; GFX11-NEXT: s_and_b32 s6, s1, 0x1ff
; GFX11-NEXT: s_lshr_b32 s5, s1, 8
; GFX11-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX11-NEXT: s_or_b32 s0, s6, s0
+; GFX11-NEXT: s_and_b32 s6, s1, 0x1ff
; GFX11-NEXT: s_and_b32 s5, s5, 0xffe
; GFX11-NEXT: s_or_b32 s2, s3, s2
-; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_or_b32 s0, s6, s0
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll
index acb32d4..11476a6 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll
@@ -127,7 +127,7 @@ define amdgpu_kernel void @s_fdiv_v4f64(ptr addrspace(1) %out, <4 x double> %num
; GCN-LABEL: {{^}}div_fast_2_x_pat_f64:
; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0.5
; GCN: buffer_store_dwordx2 [[MUL]]
-define amdgpu_kernel void @div_fast_2_x_pat_f64(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @div_fast_2_x_pat_f64(ptr addrspace(1) %out) #0 {
%x = load double, ptr addrspace(1) poison
%rcp = fdiv fast double %x, 2.0
store double %rcp, ptr addrspace(1) %out, align 4
@@ -139,7 +139,7 @@ define amdgpu_kernel void @div_fast_2_x_pat_f64(ptr addrspace(1) %out) #1 {
; GCN-DAG: v_mov_b32_e32 v[[K_HI:[0-9]+]], 0x3fb99999
; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, v[[[K_LO]]:[[K_HI]]]
; GCN: buffer_store_dwordx2 [[MUL]]
-define amdgpu_kernel void @div_fast_k_x_pat_f64(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @div_fast_k_x_pat_f64(ptr addrspace(1) %out) #0 {
%x = load double, ptr addrspace(1) poison
%rcp = fdiv fast double %x, 10.0
store double %rcp, ptr addrspace(1) %out, align 4
@@ -151,7 +151,7 @@ define amdgpu_kernel void @div_fast_k_x_pat_f64(ptr addrspace(1) %out) #1 {
; GCN-DAG: v_mov_b32_e32 v[[K_HI:[0-9]+]], 0xbfb99999
; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, v[[[K_LO]]:[[K_HI]]]
; GCN: buffer_store_dwordx2 [[MUL]]
-define amdgpu_kernel void @div_fast_neg_k_x_pat_f64(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @div_fast_neg_k_x_pat_f64(ptr addrspace(1) %out) #0 {
%x = load double, ptr addrspace(1) poison
%rcp = fdiv fast double %x, -10.0
store double %rcp, ptr addrspace(1) %out, align 4
@@ -159,4 +159,3 @@ define amdgpu_kernel void @div_fast_neg_k_x_pat_f64(ptr addrspace(1) %out) #1 {
}
attributes #0 = { nounwind }
-attributes #1 = { nounwind "unsafe-fp-math"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll b/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll
index 92eb4a6..0a266bc 100644
--- a/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll
@@ -284,4 +284,4 @@ define <2 x float> @unsafe_fast_fmul_fsub_ditribute_post_legalize(float %arg0, <
ret <2 x float> %tmp1
}
-attributes #0 = { "no-infs-fp-math"="true" "unsafe-fp-math"="true" }
+attributes #0 = { "no-infs-fp-math"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll b/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll
index bc85dc2..3e513de 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll
@@ -219,8 +219,8 @@ define <2 x bfloat> @v_test_fmed3_r_i_i_v2bf16_minimumnum_maximumnum(<2 x bfloat
}
attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" }
-attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }
+attributes #1 = { nounwind "no-nans-fp-math"="false" }
+attributes #2 = { nounwind "no-nans-fp-math"="true" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX11: {{.*}}
; GFX11-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 3145a27..60ac0b9 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -8905,4 +8905,4 @@ declare half @llvm.minnum.f16(half, half) #0
declare half @llvm.maxnum.f16(half, half) #0
attributes #0 = { nounwind readnone }
-attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }
+attributes #2 = { nounwind "no-nans-fp-math"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.legal.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.legal.f16.ll
index d8bbda1..69d1ee3f 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.legal.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.legal.f16.ll
@@ -159,7 +159,7 @@ declare half @llvm.amdgcn.interp.p2.f16(float, float, i32, i32, i1, i32) #0
attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind "unsafe-fp-math"="true" }
+attributes #2 = { nounwind }
attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" }
attributes #4 = { nounwind "amdgpu-ieee"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
index aaea4f7..b3202cb 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
@@ -8006,7 +8006,7 @@ declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0
attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind "unsafe-fp-math"="true" }
+attributes #2 = { nounwind }
attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN-NSZ: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
index d41e2c6..8df7564 100644
--- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,GFX9 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-FAKE16 %s
define amdgpu_kernel void @fpext_f16_to_f32(
; SI-LABEL: fpext_f16_to_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
index a43292d..a043d53 100644
--- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
define amdgpu_kernel void @fptosi_f16_to_i16(
diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
index 96cb621..af1ab37 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
define amdgpu_kernel void @fptoui_f16_to_i16(
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index b0dd187..c28b25c7 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -599,10 +599,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; SI-GISEL-NEXT: s_and_b32 s6, s6, 0xffe
; SI-GISEL-NEXT: s_or_b32 s4, s7, s4
-; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; SI-GISEL-NEXT: s_or_b32 s4, s6, s4
-; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9
; SI-GISEL-NEXT: s_lshl_b32 s7, s3, 12
@@ -711,10 +709,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
; VI-GISEL-NEXT: s_or_b32 s2, s6, s2
-; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; VI-GISEL-NEXT: s_or_b32 s2, s5, s2
-; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4
; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12
@@ -824,10 +820,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX9-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
; GFX9-GISEL-NEXT: s_or_b32 s2, s6, s2
-; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; GFX9-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX9-GISEL-NEXT: s_sub_i32 s7, 1, s4
; GFX9-GISEL-NEXT: s_lshl_b32 s6, s4, 12
@@ -937,10 +931,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX950-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; GFX950-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
; GFX950-GISEL-NEXT: s_or_b32 s2, s6, s2
-; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; GFX950-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX950-GISEL-NEXT: s_sub_i32 s7, 1, s4
; GFX950-GISEL-NEXT: s_lshl_b32 s6, s4, 12
@@ -1118,17 +1110,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2
-; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000
@@ -1175,17 +1165,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2
-; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000
@@ -1366,17 +1354,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8
-; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
+; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s4, 0xfc10
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0
-; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2
-; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s6, 1, s4
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000
@@ -1423,17 +1409,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8
-; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
+; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s4, 0xfc10
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0
-; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2
-; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s6, 1, s4
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000
@@ -2154,10 +2138,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe
; SI-GISEL-NEXT: s_or_b32 s4, s9, s4
-; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; SI-GISEL-NEXT: s_or_b32 s4, s8, s4
-; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
; SI-GISEL-NEXT: s_lshl_b32 s8, s8, 9
; SI-GISEL-NEXT: s_lshl_b32 s9, s3, 12
@@ -2193,12 +2175,10 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; SI-GISEL-NEXT: s_and_b32 s4, s4, 0x8000
; SI-GISEL-NEXT: s_addk_i32 s5, 0xfc10
; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe
-; SI-GISEL-NEXT: s_or_b32 s6, s9, s6
; SI-GISEL-NEXT: s_or_b32 s3, s4, s3
-; SI-GISEL-NEXT: s_cmp_lg_u32 s6, 0
+; SI-GISEL-NEXT: s_or_b32 s4, s9, s6
; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; SI-GISEL-NEXT: s_or_b32 s4, s8, s4
-; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9
; SI-GISEL-NEXT: s_lshl_b32 s8, s5, 12
@@ -2355,10 +2335,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; VI-GISEL-NEXT: s_addk_i32 s2, 0xfc10
; VI-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
; VI-GISEL-NEXT: s_or_b32 s4, s8, s4
-; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; VI-GISEL-NEXT: s_or_b32 s3, s3, s4
-; VI-GISEL-NEXT: s_cmp_lg_u32 s3, 0
; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; VI-GISEL-NEXT: s_sub_i32 s9, 1, s2
; VI-GISEL-NEXT: s_lshl_b32 s8, s2, 12
@@ -2392,14 +2370,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; VI-GISEL-NEXT: s_or_b32 s2, s3, s2
; VI-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
; VI-GISEL-NEXT: s_lshr_b32 s4, s7, 8
-; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; VI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; VI-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
+; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; VI-GISEL-NEXT: s_or_b32 s5, s5, s6
-; VI-GISEL-NEXT: s_cmp_lg_u32 s5, 0
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; VI-GISEL-NEXT: s_or_b32 s4, s4, s5
-; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; VI-GISEL-NEXT: s_sub_i32 s8, 1, s3
; VI-GISEL-NEXT: s_lshl_b32 s6, s3, 12
@@ -2555,10 +2531,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX9-GISEL-NEXT: s_addk_i32 s2, 0xfc10
; GFX9-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
; GFX9-GISEL-NEXT: s_or_b32 s4, s8, s4
-; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX9-GISEL-NEXT: s_or_b32 s3, s3, s4
-; GFX9-GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX9-GISEL-NEXT: s_sub_i32 s9, 1, s2
; GFX9-GISEL-NEXT: s_lshl_b32 s8, s2, 12
@@ -2592,14 +2566,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2
; GFX9-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
; GFX9-GISEL-NEXT: s_lshr_b32 s4, s7, 8
-; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; GFX9-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; GFX9-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
+; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; GFX9-GISEL-NEXT: s_or_b32 s5, s5, s6
-; GFX9-GISEL-NEXT: s_cmp_lg_u32 s5, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX9-GISEL-NEXT: s_or_b32 s4, s4, s5
-; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX9-GISEL-NEXT: s_sub_i32 s8, 1, s3
; GFX9-GISEL-NEXT: s_lshl_b32 s6, s3, 12
@@ -2752,10 +2724,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX950-GISEL-NEXT: s_addk_i32 s2, 0xfc10
; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
; GFX950-GISEL-NEXT: s_or_b32 s4, s8, s4
-; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX950-GISEL-NEXT: s_or_b32 s3, s3, s4
-; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX950-GISEL-NEXT: s_sub_i32 s9, 1, s2
; GFX950-GISEL-NEXT: s_lshl_b32 s8, s2, 12
@@ -2789,14 +2759,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX950-GISEL-NEXT: s_or_b32 s2, s3, s2
; GFX950-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
; GFX950-GISEL-NEXT: s_lshr_b32 s4, s7, 8
-; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; GFX950-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
+; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; GFX950-GISEL-NEXT: s_or_b32 s5, s5, s6
-; GFX950-GISEL-NEXT: s_cmp_lg_u32 s5, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX950-GISEL-NEXT: s_or_b32 s4, s4, s5
-; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX950-GISEL-NEXT: s_sub_i32 s8, 1, s3
; GFX950-GISEL-NEXT: s_lshl_b32 s6, s3, 12
@@ -3073,17 +3041,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8
-; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s2, 0xfc10
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe
-; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4
-; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s8, 1, s2
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000
@@ -3115,19 +3081,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-TRUE16-NEXT: s_cmpk_eq_i32 s2, 0x40f
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16
-; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s8, s6
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3
-; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000
@@ -3176,17 +3140,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8
-; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s2, 0xfc10
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe
-; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4
-; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s8, 1, s2
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000
@@ -3218,19 +3180,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-FAKE16-NEXT: s_cmpk_eq_i32 s2, 0x40f
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16
-; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s8, s6
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3
-; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000
@@ -3511,17 +3471,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8
-; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
+; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s2, 0xfc10
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe
-; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
-; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4
-; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s8, 1, s2
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000
@@ -3553,19 +3511,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX1250-GISEL-TRUE16-NEXT: s_cmp_eq_u32 s2, 0x40f
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3
; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16
-; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6
+; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s4, 0xfc10
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2
-; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s8, s6
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0
-; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3
-; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s6, 1, s4
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000
@@ -3614,17 +3570,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8
-; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
+; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s2, 0xfc10
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe
-; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
-; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4
-; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s8, 1, s2
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000
@@ -3656,19 +3610,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX1250-GISEL-FAKE16-NEXT: s_cmp_eq_u32 s2, 0x40f
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3
; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16
-; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6
+; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s4, 0xfc10
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2
-; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s8, s6
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0
-; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3
-; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s6, 1, s4
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 5d31177..b6b26a4 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -2,14 +2,14 @@
; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI-SDAG,VI-SAFE-SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI-GISEL,VI-SAFE-GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI-SDAG,VI-UNSAFE-SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI-SDAG,VI-UNSAFE-SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10-SDAG,GFX10-SAFE-SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10-GISEL,GFX10-SAFE-GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=0 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX10-SDAG,GFX10-UNSAFE-SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10-SDAG,GFX10-UNSAFE-SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-SAFE-SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-SAFE-GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-FAKE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-FAKE16 %s
@@ -182,7 +182,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; SI-NEXT: s_and_b32 s1, s7, 0x1ff
; SI-NEXT: s_and_b32 s8, s0, 0xffe
; SI-NEXT: s_or_b32 s0, s1, s6
-; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014
@@ -237,7 +236,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; VI-SDAG-NEXT: s_and_b32 s8, s4, 0xffe
; VI-SDAG-NEXT: s_and_b32 s4, s7, 0x1ff
; VI-SDAG-NEXT: s_or_b32 s4, s4, s6
-; VI-SDAG-NEXT: s_cmp_lg_u32 s4, 0
; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0
; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
@@ -290,10 +288,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
; VI-GISEL-NEXT: s_or_b32 s2, s6, s2
-; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; VI-GISEL-NEXT: s_or_b32 s2, s5, s2
-; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4
; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12
@@ -335,11 +331,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff
-; GFX10-SDAG-NEXT: s_lshr_b32 s5, s3, 8
-; GFX10-SDAG-NEXT: s_or_b32 s2, s4, s2
-; GFX10-SDAG-NEXT: s_and_b32 s4, s5, 0xffe
-; GFX10-SDAG-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10-SDAG-NEXT: s_lshr_b32 s4, s3, 8
+; GFX10-SDAG-NEXT: s_and_b32 s5, s3, 0x1ff
+; GFX10-SDAG-NEXT: s_and_b32 s4, s4, 0xffe
+; GFX10-SDAG-NEXT: s_or_b32 s2, s5, s2
; GFX10-SDAG-NEXT: s_cselect_b32 s2, -1, 0
; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
; GFX10-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014
@@ -387,16 +382,14 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX10-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX10-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2
+; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX10-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; GFX10-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2
; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; GFX10-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX10-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX10-GISEL-NEXT: s_sub_i32 s6, 1, s4
; GFX10-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
@@ -438,11 +431,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff
-; GFX11-SDAG-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-SDAG-NEXT: s_or_b32 s2, s4, s2
-; GFX11-SDAG-NEXT: s_and_b32 s4, s5, 0xffe
-; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT: s_lshr_b32 s4, s3, 8
+; GFX11-SDAG-NEXT: s_and_b32 s5, s3, 0x1ff
+; GFX11-SDAG-NEXT: s_and_b32 s4, s4, 0xffe
+; GFX11-SDAG-NEXT: s_or_b32 s2, s5, s2
; GFX11-SDAG-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
@@ -498,17 +490,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX11-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2
+; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2
; GFX11-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
diff --git a/llvm/test/CodeGen/AMDGPU/fract.f64.ll b/llvm/test/CodeGen/AMDGPU/fract.f64.ll
index f09c1c6..cc2e78d 100644
--- a/llvm/test/CodeGen/AMDGPU/fract.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract.f64.ll
@@ -2,8 +2,8 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -enable-unsafe-fp-math < %s | FileCheck --check-prefixes=GCN,SI,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck --check-prefixes=GCN,SI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s
declare double @llvm.fabs.f64(double) #0
declare double @llvm.floor.f64(double) #0
diff --git a/llvm/test/CodeGen/AMDGPU/fract.ll b/llvm/test/CodeGen/AMDGPU/fract.ll
index 8ef0fcf..723fd93 100644
--- a/llvm/test/CodeGen/AMDGPU/fract.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract.ll
@@ -1,8 +1,8 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck --check-prefix=GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck --check-prefix=GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -enable-unsafe-fp-math < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefix=GCN %s
declare float @llvm.fabs.f32(float) #0
declare float @llvm.floor.f32(float) #0
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 6f91222..d8cbdb1 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -2048,7 +2048,7 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
; GFX1200-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX1200-FAKE16-NEXT: s_endpgm
- ptr addrspace(1) %in2) #1 {
+ ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
%r0 = load half, ptr addrspace(1) %in1, align 4
%r1 = load half, ptr addrspace(1) %gep2, align 4
@@ -3417,7 +3417,7 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
; GFX1200-NEXT: v_fmac_f32_e32 v1, v3, v2
; GFX1200-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1200-NEXT: s_endpgm
- ptr addrspace(1) %in2) #1 {
+ ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
%r0 = load float, ptr addrspace(1) %in1, align 4
%r1 = load float, ptr addrspace(1) %gep2, align 4
@@ -4821,7 +4821,7 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
; GFX1200-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; GFX1200-NEXT: global_store_b64 v12, v[0:1], s[0:1]
; GFX1200-NEXT: s_endpgm
- ptr addrspace(1) %in2) #1 {
+ ptr addrspace(1) %in2) #0 {
%r0 = load double, ptr addrspace(1) %in1, align 8
%r1 = load double, ptr addrspace(1) %in2, align 8
%r2 = frem afn double %r0, %r1
@@ -18918,7 +18918,4 @@ define amdgpu_kernel void @frem_v2f64_const(ptr addrspace(1) %out) #0 {
-attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
-attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
-
-
+attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
index 1b74ddf..9b97981 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -2870,7 +2870,7 @@ define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 {
ret double %result
}
-define double @v_sqrt_f64__unsafe_attr(double %x) #4 {
+define double @v_sqrt_f64__unsafe_attr(double %x) {
; GFX6-SDAG-LABEL: v_sqrt_f64__unsafe_attr:
; GFX6-SDAG: ; %bb.0:
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3449,7 +3449,6 @@ declare i32 @llvm.amdgcn.readfirstlane(i32) #1
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { convergent nounwind willreturn memory(none) }
attributes #3 = { "no-nans-fp-math"="true" "no-infs-fp-math"="true" }
-attributes #4 = { "unsafe-fp-math"="true" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX6: {{.*}}
; GFX8: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll
index 9f19bcb..c93c077 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll
@@ -239,4 +239,4 @@ declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) #0
declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) #0
attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind "unsafe-fp-math"="true" }
+attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 37756d1..31f277f 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -472,7 +472,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -536,11 +535,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -606,7 +604,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -660,12 +657,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -710,9 +706,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1690,7 +1685,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1754,11 +1748,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1824,7 +1817,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1878,12 +1870,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1928,9 +1919,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2968,7 +2958,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3032,11 +3021,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3102,7 +3090,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3156,12 +3143,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3206,9 +3192,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3742,7 +3727,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3806,11 +3790,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3876,7 +3859,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3930,12 +3912,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3980,9 +3961,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5019,7 +4999,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5083,11 +5062,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5153,7 +5131,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5207,12 +5184,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5270,9 +5246,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6284,7 +6259,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB10_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6354,7 +6328,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6424,7 +6397,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6485,8 +6457,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6550,7 +6520,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7717,7 +7686,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB12_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7787,7 +7755,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7857,7 +7824,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7918,8 +7884,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7983,7 +7947,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9150,7 +9113,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB14_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9220,7 +9182,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9290,7 +9251,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9351,8 +9311,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9416,7 +9374,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10065,7 +10022,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB15_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10135,7 +10091,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10205,7 +10160,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10266,8 +10220,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10331,7 +10283,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11498,7 +11449,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB17_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11568,7 +11518,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11638,7 +11587,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11699,8 +11647,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11764,7 +11710,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 6351bb3..4581efc 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -381,13 +381,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX9-NEXT: .LBB1_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -457,7 +456,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -513,7 +511,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
@@ -562,8 +559,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -610,11 +606,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1420,13 +1414,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1496,7 +1489,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1552,7 +1544,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1601,8 +1592,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1649,11 +1639,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2459,13 +2447,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2535,7 +2522,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2591,7 +2577,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
@@ -2640,8 +2625,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2688,11 +2672,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3591,7 +3573,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3665,7 +3646,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3724,7 +3704,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3774,8 +3753,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3841,10 +3819,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4859,7 +4836,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB9_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4933,7 +4909,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4992,7 +4967,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5042,8 +5016,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5109,10 +5082,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6127,7 +6099,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6201,7 +6172,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6260,7 +6230,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6310,8 +6279,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6377,10 +6345,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index a9ac008..bd570d9 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -381,13 +381,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX9-NEXT: .LBB1_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_min_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -457,7 +456,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -513,7 +511,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
@@ -562,8 +559,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -610,11 +606,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1420,13 +1414,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_min_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1496,7 +1489,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1552,7 +1544,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1601,8 +1592,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1649,11 +1639,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2459,13 +2447,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_min_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2535,7 +2522,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2591,7 +2577,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
@@ -2640,8 +2625,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2688,11 +2672,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3591,7 +3573,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3665,7 +3646,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3724,7 +3704,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3774,8 +3753,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3841,10 +3819,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4859,7 +4836,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB9_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4933,7 +4909,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4992,7 +4967,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5042,8 +5016,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5109,10 +5082,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6127,7 +6099,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6201,7 +6172,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6260,7 +6230,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6310,8 +6279,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6377,10 +6345,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 6311143..1f2d70c 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -532,7 +532,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -596,11 +595,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -666,7 +664,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -720,12 +717,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -783,9 +779,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1862,7 +1857,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1926,11 +1920,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1996,7 +1989,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2050,12 +2042,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2113,9 +2104,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3192,7 +3182,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3256,11 +3245,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3326,7 +3314,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3380,12 +3367,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3443,9 +3429,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4018,7 +4003,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4082,11 +4066,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4152,7 +4135,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4206,12 +4188,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4269,9 +4250,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5347,7 +5327,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5411,11 +5390,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5481,7 +5459,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5535,12 +5512,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5598,9 +5574,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6612,7 +6587,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB10_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6682,7 +6656,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6752,7 +6725,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6813,8 +6785,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6878,7 +6848,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8044,7 +8013,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB12_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8114,7 +8082,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8184,7 +8151,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8245,8 +8211,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8310,7 +8274,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9477,7 +9440,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB14_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9547,7 +9509,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9617,7 +9578,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9678,8 +9638,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9743,7 +9701,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10392,7 +10349,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB15_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10462,7 +10418,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10532,7 +10487,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10593,8 +10547,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10658,7 +10610,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11824,7 +11775,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB17_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11894,7 +11844,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11964,7 +11913,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12025,8 +11973,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12090,7 +12036,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/inline-attr.ll b/llvm/test/CodeGen/AMDGPU/inline-attr.ll
index 4ae0ba0..c33b3344 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-attr.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-attr.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -O3 -enable-unsafe-fp-math %s | FileCheck --check-prefixes=GCN,UNSAFE %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -O3 %s | FileCheck --check-prefixes=GCN,UNSAFE %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -O3 -enable-no-nans-fp-math %s | FileCheck --check-prefixes=GCN,NONANS %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -O3 -enable-no-infs-fp-math %s | FileCheck --check-prefixes=GCN,NOINFS %s
@@ -36,18 +36,18 @@ entry:
ret void
}
-attributes #0 = { nounwind "uniform-work-group-size"="false" "unsafe-fp-math"="true"}
-attributes #1 = { nounwind "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" }
+attributes #0 = { nounwind "uniform-work-group-size"="false"}
+attributes #1 = { nounwind "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" }
;.
-; UNSAFE: attributes #[[ATTR0]] = { nounwind "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
-; UNSAFE: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
+; UNSAFE: attributes #[[ATTR0]] = { nounwind "uniform-work-group-size"="false" }
+; UNSAFE: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "uniform-work-group-size"="false" }
;.
-; NONANS: attributes #[[ATTR0]] = { nounwind "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
-; NONANS: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
+; NONANS: attributes #[[ATTR0]] = { nounwind "no-nans-fp-math"="true" "uniform-work-group-size"="false" }
+; NONANS: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="true" "uniform-work-group-size"="false" }
;.
-; NOINFS: attributes #[[ATTR0]] = { nounwind "no-infs-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
-; NOINFS: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
+; NOINFS: attributes #[[ATTR0]] = { nounwind "no-infs-fp-math"="true" "uniform-work-group-size"="false" }
+; NOINFS: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="false" "uniform-work-group-size"="false" }
;.
; UNSAFE: [[META0]] = !{}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index eee232a..c3f3917 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -136,19 +136,17 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: .LBB2_6: ; %bb18
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: v_readfirstlane_b32 s13, v0
-; GFX11-NEXT: s_cmp_lg_u32 s1, 0
-; GFX11-NEXT: s_cselect_b32 s1, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1
-; GFX11-NEXT: s_and_b32 s1, s8, s1
-; GFX11-NEXT: s_and_b32 s1, s1, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_cselect_b32 s13, -1, 0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13
+; GFX11-NEXT: s_and_b32 s13, s8, s13
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_and_b32 s13, s13, exec_lo
; GFX11-NEXT: v_readfirstlane_b32 s19, v2
-; GFX11-NEXT: s_cselect_b32 s1, s19, s13
-; GFX11-NEXT: s_and_b32 s13, 0xffff, s0
+; GFX11-NEXT: s_cselect_b32 s1, s19, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s1, s1, 1
-; GFX11-NEXT: s_cmp_lg_u32 s13, 0
+; GFX11-NEXT: s_and_b32 s13, 0xffff, s0
; GFX11-NEXT: s_cselect_b32 s13, -1, 0
; GFX11-NEXT: s_and_b32 s20, s9, exec_lo
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13
diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
index b91963f..d23509b 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
@@ -150,7 +150,6 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
; GFX10CU-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
; GFX10CU-NEXT: v_mov_b32_e32 v0, s13
; GFX10CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX10CU-NEXT: s_barrier
; GFX10CU-NEXT: ds_read_b32 v0, v0
; GFX10CU-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.add.min.max.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.add.min.max.ll
new file mode 100644
index 0000000..99421d4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.add.min.max.ll
@@ -0,0 +1,191 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250-GISEL %s
+
+declare i32 @llvm.amdgcn.add.min.i32(i32, i32, i32, i1)
+declare i32 @llvm.amdgcn.add.max.i32(i32, i32, i32, i1)
+declare i32 @llvm.amdgcn.add.min.u32(i32, i32, i32, i1)
+declare i32 @llvm.amdgcn.add.max.u32(i32, i32, i32, i1)
+declare <2 x i16> @llvm.amdgcn.pk.add.min.i16(<2 x i16>, <2 x i16>, <2 x i16>, i1)
+declare <2 x i16> @llvm.amdgcn.pk.add.max.i16(<2 x i16>, <2 x i16>, <2 x i16>, i1)
+declare <2 x i16> @llvm.amdgcn.pk.add.min.u16(<2 x i16>, <2 x i16>, <2 x i16>, i1)
+declare <2 x i16> @llvm.amdgcn.pk.add.max.u16(<2 x i16>, <2 x i16>, <2 x i16>, i1)
+
+define i32 @test_add_min_i32_vvv(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: test_add_min_i32_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_add_min_i32 v0, v0, v1, v2
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.add.min.i32(i32 %a, i32 %b, i32 %c, i1 0)
+ ret i32 %ret
+}
+
+define i32 @test_add_min_i32_ssi_clamp(i32 inreg %a, i32 inreg %b) {
+; GCN-LABEL: test_add_min_i32_ssi_clamp:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_add_min_i32 v0, s0, s1, 1 clamp
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.add.min.i32(i32 %a, i32 %b, i32 1, i1 1)
+ ret i32 %ret
+}
+
+define i32 @test_add_min_u32_vvv(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: test_add_min_u32_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_add_min_u32 v0, v0, v1, v2
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.add.min.u32(i32 %a, i32 %b, i32 %c, i1 0)
+ ret i32 %ret
+}
+
+define i32 @test_add_min_u32_ssi_clamp(i32 inreg %a, i32 inreg %b) {
+; GCN-LABEL: test_add_min_u32_ssi_clamp:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_add_min_u32 v0, s0, s1, 1 clamp
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.add.min.u32(i32 %a, i32 %b, i32 1, i1 1)
+ ret i32 %ret
+}
+
+define i32 @test_add_max_i32_vvv(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: test_add_max_i32_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_add_max_i32 v0, v0, v1, v2
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.add.max.i32(i32 %a, i32 %b, i32 %c, i1 0)
+ ret i32 %ret
+}
+
+define i32 @test_add_max_i32_ssi_clamp(i32 inreg %a, i32 inreg %b) {
+; GCN-LABEL: test_add_max_i32_ssi_clamp:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_add_max_i32 v0, s0, s1, 1 clamp
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.add.max.i32(i32 %a, i32 %b, i32 1, i1 1)
+ ret i32 %ret
+}
+
+define i32 @test_add_max_u32_vvv(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: test_add_max_u32_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_add_max_u32 v0, v0, v1, v2
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.add.max.u32(i32 %a, i32 %b, i32 %c, i1 0)
+ ret i32 %ret
+}
+
+define i32 @test_add_max_u32_ssi_clamp(i32 inreg %a, i32 inreg %b) {
+; GCN-LABEL: test_add_max_u32_ssi_clamp:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_add_max_u32 v0, s0, s1, 1 clamp
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.add.max.u32(i32 %a, i32 %b, i32 1, i1 1)
+ ret i32 %ret
+}
+
+define <2 x i16> @test_add_min_i16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
+; GCN-LABEL: test_add_min_i16_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_pk_add_min_i16 v0, v0, v1, v2
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.min.i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, i1 0)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_add_min_i16_ssi_clamp(<2 x i16> inreg %a, <2 x i16> inreg %b) {
+; GCN-LABEL: test_add_min_i16_ssi_clamp:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_pk_add_min_i16 v0, s0, s1, 1 op_sel_hi:[1,1,0] clamp
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.min.i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> <i16 1, i16 1>, i1 1)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_add_min_u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
+; GCN-LABEL: test_add_min_u16_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_pk_add_min_u16 v0, v0, v1, v2
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.min.u16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, i1 0)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_add_min_u16_ssi_clamp(<2 x i16> inreg %a, <2 x i16> inreg %b) {
+; GCN-LABEL: test_add_min_u16_ssi_clamp:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_pk_add_min_u16 v0, s0, s1, 1 op_sel_hi:[1,1,0] clamp
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.min.u16(<2 x i16> %a, <2 x i16> %b, <2 x i16> <i16 1, i16 1>, i1 1)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_add_max_i16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
+; GCN-LABEL: test_add_max_i16_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_pk_add_max_i16 v0, v0, v1, v2
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.max.i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, i1 0)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_add_max_i16_ssi_clamp(<2 x i16> inreg %a, <2 x i16> inreg %b) {
+; GCN-LABEL: test_add_max_i16_ssi_clamp:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_pk_add_max_i16 v0, s0, s1, 1 op_sel_hi:[1,1,0] clamp
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.max.i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> <i16 1, i16 1>, i1 1)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_add_max_u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
+; GCN-LABEL: test_add_max_u16_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_pk_add_max_u16 v0, v0, v1, v2
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.max.u16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, i1 0)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_add_max_u16_ssi_clamp(<2 x i16> inreg %a, <2 x i16> inreg %b) {
+; GCN-LABEL: test_add_max_u16_ssi_clamp:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_pk_add_max_u16 v0, s0, s1, 1 op_sel_hi:[1,1,0] clamp
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.max.u16(<2 x i16> %a, <2 x i16> %b, <2 x i16> <i16 1, i16 1>, i1 1)
+ ret <2 x i16> %ret
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1250-GISEL: {{.*}}
+; GFX1250-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
index e8b8d05..e8eccb0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
@@ -147,14 +147,13 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
; GFX8-OPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-OPT-NEXT: v_mov_b32_e32 v2, 0
; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-OPT-NEXT: s_barrier
-; GFX8-OPT-NEXT: v_add_u32_e32 v1, vcc, v1, v1
-; GFX8-OPT-NEXT: s_nop 1
-; GFX8-OPT-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
-; GFX8-OPT-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-OPT-NEXT: v_add_u32_e32 v4, vcc, v1, v1
+; GFX8-OPT-NEXT: v_mov_b32_e32 v3, s1
; GFX8-OPT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-OPT-NEXT: v_mov_b32_dpp v2, v4 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
+; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-OPT-NEXT: v_add_u32_e32 v2, vcc, v2, v4
+; GFX8-OPT-NEXT: s_barrier
; GFX8-OPT-NEXT: flat_store_dword v[0:1], v2
; GFX8-OPT-NEXT: s_endpgm
;
@@ -194,14 +193,14 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: ds_read_b32 v1, v0
-; GFX10-NEXT: s_barrier
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0
-; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v1
-; GFX10-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
-; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v3, v1, v1
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
+; GFX10-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
+; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX10-NEXT: s_barrier
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
@@ -213,15 +212,15 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX11-NEXT: ds_load_b32 v1, v0
-; GFX11-NEXT: s_barrier
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v3, v1, v1
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
+; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX11-NEXT: s_barrier
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index 883db20..e30a586 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -1485,7 +1485,7 @@ define float @v_exp2_f32_fast(float %in) {
ret float %result
}
-define float @v_exp2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
+define float @v_exp2_f32_unsafe_math_attr(float %in) {
; SI-SDAG-LABEL: v_exp2_f32_unsafe_math_attr:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 0854134..61a777f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -1907,7 +1907,7 @@ define float @v_log2_f32_fast(float %in) {
ret float %result
}
-define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
+define float @v_log2_f32_unsafe_math_attr(float %in) {
; SI-SDAG-LABEL: v_log2_f32_unsafe_math_attr:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index 8748aff..6dc9199 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -8265,12 +8265,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_readlane_b32 s6, v1, s3
-; GFX12-NEXT: s_lshl_b32 s7, 1, s3
; GFX12-NEXT: v_writelane_b32 v0, s0, s3
+; GFX12-NEXT: s_lshl_b32 s3, 1, s3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 s1, s1, s7
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12-NEXT: s_and_not1_b32 s1, s1, s3
; GFX12-NEXT: s_add_f32 s0, s0, s6
; GFX12-NEXT: s_cbranch_scc1 .LBB28_5
; GFX12-NEXT: ; %bb.6: ; %ComputeEnd
@@ -8351,14 +8349,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX942-NEXT: .LBB28_5: ; %ComputeLoop
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX942-NEXT: v_readfirstlane_b32 s8, v1
-; GFX942-NEXT: v_readlane_b32 s9, v2, s3
+; GFX942-NEXT: v_readfirstlane_b32 s6, v1
; GFX942-NEXT: s_mov_b32 m0, s3
+; GFX942-NEXT: v_readlane_b32 s8, v2, s3
+; GFX942-NEXT: v_writelane_b32 v0, s6, m0
+; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX942-NEXT: v_writelane_b32 v0, s8, m0
-; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX942-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX942-NEXT: v_add_f32_e32 v1, s8, v1
; GFX942-NEXT: s_cbranch_scc1 .LBB28_5
; GFX942-NEXT: ; %bb.6: ; %ComputeEnd
; GFX942-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8440,15 +8437,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX11-NEXT: .LBB28_5: ; %ComputeLoop
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_ctz_i32_b32 s1, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readlane_b32 s6, v2, s1
-; GFX11-NEXT: s_lshl_b32 s7, 1, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 s0, s0, s7
; GFX11-NEXT: v_writelane_b32 v0, s3, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_add_f32_e32 v1, s6, v1
-; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_lshl_b32 s1, 1, s1
+; GFX11-NEXT: s_and_not1_b32 s0, s0, s1
; GFX11-NEXT: s_cbranch_scc1 .LBB28_5
; GFX11-NEXT: ; %bb.6: ; %ComputeEnd
; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8528,11 +8524,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX10-NEXT: s_ff1_i32_b32 s1, s0
; GFX10-NEXT: v_readfirstlane_b32 s3, v1
; GFX10-NEXT: v_readlane_b32 s6, v2, s1
-; GFX10-NEXT: s_lshl_b32 s7, 1, s1
-; GFX10-NEXT: s_andn2_b32 s0, s0, s7
; GFX10-NEXT: v_writelane_b32 v0, s3, s1
; GFX10-NEXT: v_add_f32_e32 v1, s6, v1
-; GFX10-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10-NEXT: s_lshl_b32 s1, 1, s1
+; GFX10-NEXT: s_andn2_b32 s0, s0, s1
; GFX10-NEXT: s_cbranch_scc1 .LBB28_5
; GFX10-NEXT: ; %bb.6: ; %ComputeEnd
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8609,14 +8604,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX90A-NEXT: .LBB28_5: ; %ComputeLoop
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX90A-NEXT: v_readfirstlane_b32 s8, v1
-; GFX90A-NEXT: v_readlane_b32 s9, v2, s3
+; GFX90A-NEXT: v_readfirstlane_b32 s6, v1
; GFX90A-NEXT: s_mov_b32 m0, s3
+; GFX90A-NEXT: v_readlane_b32 s8, v2, s3
+; GFX90A-NEXT: v_writelane_b32 v0, s6, m0
+; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX90A-NEXT: v_writelane_b32 v0, s8, m0
-; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX90A-NEXT: v_add_f32_e32 v1, s8, v1
; GFX90A-NEXT: s_cbranch_scc1 .LBB28_5
; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd
; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8692,14 +8686,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX908-NEXT: .LBB28_5: ; %ComputeLoop
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX908-NEXT: v_readfirstlane_b32 s8, v1
-; GFX908-NEXT: v_readlane_b32 s9, v2, s3
+; GFX908-NEXT: v_readfirstlane_b32 s6, v1
; GFX908-NEXT: s_mov_b32 m0, s3
+; GFX908-NEXT: v_readlane_b32 s8, v2, s3
+; GFX908-NEXT: v_writelane_b32 v0, s6, m0
+; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX908-NEXT: v_writelane_b32 v0, s8, m0
-; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX908-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX908-NEXT: v_add_f32_e32 v1, s8, v1
; GFX908-NEXT: s_cbranch_scc1 .LBB28_5
; GFX908-NEXT: ; %bb.6: ; %ComputeEnd
; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8776,14 +8769,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX8-NEXT: .LBB28_5: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX8-NEXT: v_readfirstlane_b32 s8, v1
-; GFX8-NEXT: v_readlane_b32 s9, v2, s3
+; GFX8-NEXT: v_readfirstlane_b32 s6, v1
; GFX8-NEXT: s_mov_b32 m0, s3
+; GFX8-NEXT: v_readlane_b32 s8, v2, s3
+; GFX8-NEXT: v_writelane_b32 v0, s6, m0
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: v_writelane_b32 v0, s8, m0
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX8-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX8-NEXT: v_add_f32_e32 v1, s8, v1
; GFX8-NEXT: s_cbranch_scc1 .LBB28_5
; GFX8-NEXT: ; %bb.6: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9130,12 +9122,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_readlane_b32 s6, v1, s3
-; GFX12-NEXT: s_lshl_b32 s7, 1, s3
; GFX12-NEXT: v_writelane_b32 v0, s0, s3
+; GFX12-NEXT: s_lshl_b32 s3, 1, s3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 s1, s1, s7
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12-NEXT: s_and_not1_b32 s1, s1, s3
; GFX12-NEXT: s_add_f32 s0, s0, s6
; GFX12-NEXT: s_cbranch_scc1 .LBB29_5
; GFX12-NEXT: ; %bb.6: ; %ComputeEnd
@@ -9212,14 +9202,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX942-NEXT: .LBB29_5: ; %ComputeLoop
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX942-NEXT: v_readfirstlane_b32 s8, v1
-; GFX942-NEXT: v_readlane_b32 s9, v2, s3
+; GFX942-NEXT: v_readfirstlane_b32 s6, v1
; GFX942-NEXT: s_mov_b32 m0, s3
+; GFX942-NEXT: v_readlane_b32 s8, v2, s3
+; GFX942-NEXT: v_writelane_b32 v0, s6, m0
+; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX942-NEXT: v_writelane_b32 v0, s8, m0
-; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX942-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX942-NEXT: v_add_f32_e32 v1, s8, v1
; GFX942-NEXT: s_cbranch_scc1 .LBB29_5
; GFX942-NEXT: ; %bb.6: ; %ComputeEnd
; GFX942-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9296,15 +9285,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: .LBB29_5: ; %ComputeLoop
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_ctz_i32_b32 s1, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readlane_b32 s6, v2, s1
-; GFX11-NEXT: s_lshl_b32 s7, 1, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 s0, s0, s7
; GFX11-NEXT: v_writelane_b32 v0, s3, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_add_f32_e32 v1, s6, v1
-; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_lshl_b32 s1, 1, s1
+; GFX11-NEXT: s_and_not1_b32 s0, s0, s1
; GFX11-NEXT: s_cbranch_scc1 .LBB29_5
; GFX11-NEXT: ; %bb.6: ; %ComputeEnd
; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9377,11 +9365,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX10-NEXT: s_ff1_i32_b32 s1, s0
; GFX10-NEXT: v_readfirstlane_b32 s3, v1
; GFX10-NEXT: v_readlane_b32 s6, v2, s1
-; GFX10-NEXT: s_lshl_b32 s7, 1, s1
-; GFX10-NEXT: s_andn2_b32 s0, s0, s7
; GFX10-NEXT: v_writelane_b32 v0, s3, s1
; GFX10-NEXT: v_add_f32_e32 v1, s6, v1
-; GFX10-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10-NEXT: s_lshl_b32 s1, 1, s1
+; GFX10-NEXT: s_andn2_b32 s0, s0, s1
; GFX10-NEXT: s_cbranch_scc1 .LBB29_5
; GFX10-NEXT: ; %bb.6: ; %ComputeEnd
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9453,14 +9440,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX90A-NEXT: .LBB29_5: ; %ComputeLoop
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX90A-NEXT: v_readfirstlane_b32 s8, v1
-; GFX90A-NEXT: v_readlane_b32 s9, v2, s3
+; GFX90A-NEXT: v_readfirstlane_b32 s6, v1
; GFX90A-NEXT: s_mov_b32 m0, s3
+; GFX90A-NEXT: v_readlane_b32 s8, v2, s3
+; GFX90A-NEXT: v_writelane_b32 v0, s6, m0
+; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX90A-NEXT: v_writelane_b32 v0, s8, m0
-; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX90A-NEXT: v_add_f32_e32 v1, s8, v1
; GFX90A-NEXT: s_cbranch_scc1 .LBB29_5
; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd
; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9533,14 +9519,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX908-NEXT: .LBB29_5: ; %ComputeLoop
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX908-NEXT: v_readfirstlane_b32 s8, v1
-; GFX908-NEXT: v_readlane_b32 s9, v2, s3
+; GFX908-NEXT: v_readfirstlane_b32 s6, v1
; GFX908-NEXT: s_mov_b32 m0, s3
+; GFX908-NEXT: v_readlane_b32 s8, v2, s3
+; GFX908-NEXT: v_writelane_b32 v0, s6, m0
+; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX908-NEXT: v_writelane_b32 v0, s8, m0
-; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX908-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX908-NEXT: v_add_f32_e32 v1, s8, v1
; GFX908-NEXT: s_cbranch_scc1 .LBB29_5
; GFX908-NEXT: ; %bb.6: ; %ComputeEnd
; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9614,14 +9599,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: .LBB29_5: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX8-NEXT: v_readfirstlane_b32 s8, v1
-; GFX8-NEXT: v_readlane_b32 s9, v2, s3
+; GFX8-NEXT: v_readfirstlane_b32 s6, v1
; GFX8-NEXT: s_mov_b32 m0, s3
+; GFX8-NEXT: v_readlane_b32 s8, v2, s3
+; GFX8-NEXT: v_writelane_b32 v0, s6, m0
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: v_writelane_b32 v0, s8, m0
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX8-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX8-NEXT: v_add_f32_e32 v1, s8, v1
; GFX8-NEXT: s_cbranch_scc1 .LBB29_5
; GFX8-NEXT: ; %bb.6: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll
index 516c3946..282a7ae 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll
@@ -15,7 +15,6 @@ define amdgpu_kernel void @test_s_barrier() {
;
; GFX10-CU-LABEL: test_s_barrier:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-CU-NEXT: s_barrier
; GFX10-CU-NEXT: s_endpgm
;
@@ -26,7 +25,6 @@ define amdgpu_kernel void @test_s_barrier() {
;
; GFX11-CU-LABEL: test_s_barrier:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX11-CU-NEXT: s_barrier
; GFX11-CU-NEXT: s_endpgm
;
@@ -38,7 +36,6 @@ define amdgpu_kernel void @test_s_barrier() {
;
; GFX12-CU-LABEL: test_s_barrier:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: s_wait_alu 0xffe3
; GFX12-CU-NEXT: s_barrier_signal -1
; GFX12-CU-NEXT: s_barrier_wait -1
; GFX12-CU-NEXT: s_endpgm
@@ -63,8 +60,8 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() {
;
; GFX10-CU-LABEL: test_s_barrier_workgroup_fence:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_barrier
; GFX10-CU-NEXT: s_endpgm
;
@@ -77,8 +74,8 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() {
;
; GFX11-CU-LABEL: test_s_barrier_workgroup_fence:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_barrier
; GFX11-CU-NEXT: s_endpgm
;
@@ -94,8 +91,10 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() {
;
; GFX12-CU-LABEL: test_s_barrier_workgroup_fence:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
-; GFX12-CU-NEXT: s_wait_alu 0xffe3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_barrier_signal -1
; GFX12-CU-NEXT: s_barrier_wait -1
; GFX12-CU-NEXT: s_endpgm
@@ -125,7 +124,6 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() {
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-CU-NEXT: s_barrier
; GFX10-CU-NEXT: s_endpgm
;
@@ -140,7 +138,6 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() {
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX11-CU-NEXT: s_barrier
; GFX11-CU-NEXT: s_endpgm
;
@@ -160,7 +157,6 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() {
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-CU-NEXT: s_wait_alu 0xffe3
; GFX12-CU-NEXT: s_barrier_signal -1
; GFX12-CU-NEXT: s_barrier_wait -1
; GFX12-CU-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
index 6a76f43..7efbff9 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
@@ -107,6 +107,8 @@ define amdgpu_kernel void @workgroup_release_fence() {
;
; GFX10-CU-LABEL: workgroup_release_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_release_fence:
@@ -139,6 +141,8 @@ define amdgpu_kernel void @workgroup_release_fence() {
;
; GFX11-CU-LABEL: workgroup_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_release_fence:
@@ -151,6 +155,10 @@ define amdgpu_kernel void @workgroup_release_fence() {
;
; GFX12-CU-LABEL: workgroup_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_release_fence:
@@ -181,6 +189,8 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
;
; GFX10-CU-LABEL: workgroup_acq_rel_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_acq_rel_fence:
@@ -216,6 +226,8 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
;
; GFX11-CU-LABEL: workgroup_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_acq_rel_fence:
@@ -229,6 +241,10 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
;
; GFX12-CU-LABEL: workgroup_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_acq_rel_fence:
@@ -259,6 +275,8 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
;
; GFX10-CU-LABEL: workgroup_seq_cst_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_seq_cst_fence:
@@ -294,6 +312,8 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
;
; GFX11-CU-LABEL: workgroup_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_seq_cst_fence:
@@ -307,6 +327,10 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
;
; GFX12-CU-LABEL: workgroup_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_seq_cst_fence:
@@ -412,6 +436,8 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
;
; GFX10-CU-LABEL: workgroup_one_as_release_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_one_as_release_fence:
@@ -444,6 +470,8 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
;
; GFX11-CU-LABEL: workgroup_one_as_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_one_as_release_fence:
@@ -456,6 +484,10 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
;
; GFX12-CU-LABEL: workgroup_one_as_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_one_as_release_fence:
@@ -486,6 +518,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
;
; GFX10-CU-LABEL: workgroup_one_as_acq_rel_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_one_as_acq_rel_fence:
@@ -521,6 +555,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
;
; GFX11-CU-LABEL: workgroup_one_as_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_one_as_acq_rel_fence:
@@ -534,6 +570,10 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
;
; GFX12-CU-LABEL: workgroup_one_as_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_one_as_acq_rel_fence:
@@ -564,6 +604,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
;
; GFX10-CU-LABEL: workgroup_one_as_seq_cst_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_one_as_seq_cst_fence:
@@ -599,6 +641,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
;
; GFX11-CU-LABEL: workgroup_one_as_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_one_as_seq_cst_fence:
@@ -612,6 +656,10 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
;
; GFX12-CU-LABEL: workgroup_one_as_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_one_as_seq_cst_fence:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
index d288bfc..1cca64a 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
@@ -1093,7 +1093,8 @@ define amdgpu_kernel void @workgroup_release_fence() {
;
; GFX10-CU-LABEL: workgroup_release_fence:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_release_fence:
@@ -1129,7 +1130,8 @@ define amdgpu_kernel void @workgroup_release_fence() {
;
; GFX11-CU-LABEL: workgroup_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_release_fence:
@@ -1142,7 +1144,10 @@ define amdgpu_kernel void @workgroup_release_fence() {
;
; GFX12-CU-LABEL: workgroup_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_release_fence:
@@ -1175,7 +1180,8 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
;
; GFX10-CU-LABEL: workgroup_acq_rel_fence:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_acq_rel_fence:
@@ -1214,7 +1220,8 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
;
; GFX11-CU-LABEL: workgroup_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_acq_rel_fence:
@@ -1228,7 +1235,10 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
;
; GFX12-CU-LABEL: workgroup_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_acq_rel_fence:
@@ -1261,7 +1271,8 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
;
; GFX10-CU-LABEL: workgroup_seq_cst_fence:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_seq_cst_fence:
@@ -1300,7 +1311,8 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
;
; GFX11-CU-LABEL: workgroup_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_seq_cst_fence:
@@ -1314,7 +1326,10 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
;
; GFX12-CU-LABEL: workgroup_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_seq_cst_fence:
@@ -1420,6 +1435,8 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
;
; GFX10-CU-LABEL: workgroup_one_as_release_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_one_as_release_fence:
@@ -1452,6 +1469,8 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
;
; GFX11-CU-LABEL: workgroup_one_as_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_one_as_release_fence:
@@ -1464,6 +1483,10 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
;
; GFX12-CU-LABEL: workgroup_one_as_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_one_as_release_fence:
@@ -1494,6 +1517,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
;
; GFX10-CU-LABEL: workgroup_one_as_acq_rel_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_one_as_acq_rel_fence:
@@ -1529,6 +1554,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
;
; GFX11-CU-LABEL: workgroup_one_as_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_one_as_acq_rel_fence:
@@ -1542,6 +1569,10 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
;
; GFX12-CU-LABEL: workgroup_one_as_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_one_as_acq_rel_fence:
@@ -1572,6 +1603,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
;
; GFX10-CU-LABEL: workgroup_one_as_seq_cst_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_one_as_seq_cst_fence:
@@ -1607,6 +1640,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
;
; GFX11-CU-LABEL: workgroup_one_as_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_one_as_seq_cst_fence:
@@ -1620,6 +1655,10 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
;
; GFX12-CU-LABEL: workgroup_one_as_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_one_as_seq_cst_fence:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
index d277441..2afa577 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
@@ -1072,7 +1072,8 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -1109,7 +1110,8 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -1136,7 +1138,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
index 3826953..d384aec 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
@@ -656,12 +656,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -765,12 +765,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -800,12 +800,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -1193,7 +1195,8 @@ define amdgpu_kernel void @flat_workgroup_release_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -1278,7 +1281,8 @@ define amdgpu_kernel void @flat_workgroup_release_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -1305,7 +1309,10 @@ define amdgpu_kernel void @flat_workgroup_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -1372,7 +1379,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -1457,7 +1465,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -1484,7 +1493,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -1891,7 +1903,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -1976,7 +1989,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -2003,7 +2017,10 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -2074,9 +2091,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_atomicrmw:
@@ -2170,9 +2189,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw:
@@ -2200,9 +2221,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_acq_rel_atomicrmw:
@@ -2273,9 +2297,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_atomicrmw:
@@ -2369,9 +2395,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw:
@@ -2399,9 +2427,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_seq_cst_atomicrmw:
@@ -2697,12 +2728,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -2813,12 +2844,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -2850,12 +2881,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -2935,12 +2968,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -3051,12 +3084,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -3088,12 +3121,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -3731,7 +3766,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
@@ -3854,7 +3890,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -3889,7 +3926,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -4007,9 +4047,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
@@ -4141,9 +4183,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
@@ -4179,9 +4223,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
@@ -4299,9 +4346,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
@@ -4433,9 +4482,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
@@ -4471,9 +4522,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
@@ -5137,9 +5191,11 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_release_acquire_cmpxchg:
@@ -5271,9 +5327,11 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg:
@@ -5309,9 +5367,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_release_acquire_cmpxchg:
@@ -5429,9 +5490,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
@@ -5563,9 +5626,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
@@ -5601,9 +5666,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
@@ -5721,9 +5789,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
@@ -5855,9 +5925,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
@@ -5893,9 +5965,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
@@ -6013,9 +6088,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
@@ -6147,9 +6224,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
@@ -6185,9 +6264,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
@@ -6923,7 +7005,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
@@ -7070,7 +7153,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
@@ -7113,7 +7197,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
@@ -7245,12 +7332,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -7399,12 +7486,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -7444,12 +7531,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -7577,12 +7666,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -7731,12 +7820,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -7776,12 +7865,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -8535,12 +8626,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -8689,12 +8780,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -8734,12 +8825,14 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -8867,12 +8960,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -9021,12 +9114,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -9066,12 +9159,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -9199,12 +9294,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -9353,12 +9448,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -9398,12 +9493,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -9531,7 +9628,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9685,7 +9783,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9730,7 +9829,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9863,7 +9965,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -10017,7 +10120,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -10062,7 +10166,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -10195,12 +10302,12 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -10349,12 +10456,12 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -10394,12 +10501,14 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -10527,12 +10636,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -10681,12 +10790,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -10726,12 +10835,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -10859,12 +10970,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -11013,12 +11124,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -11058,12 +11169,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -11732,10 +11845,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -11834,10 +11950,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -11868,10 +11987,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -12258,6 +12382,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -12339,6 +12465,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -12365,6 +12493,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -12430,6 +12562,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -12511,6 +12645,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -12537,6 +12673,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -12933,6 +13073,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -13014,6 +13156,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -13040,6 +13184,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -13107,7 +13255,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
@@ -13194,7 +13345,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
@@ -13222,7 +13376,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
@@ -13290,7 +13449,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
@@ -13377,7 +13539,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
@@ -13405,7 +13570,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
@@ -13696,10 +13866,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -13805,10 +13978,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -13841,10 +14017,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -13923,10 +14104,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -14032,10 +14216,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -14068,10 +14255,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -14699,6 +14891,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
@@ -14818,6 +15012,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -14852,6 +15048,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -14966,7 +15166,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
@@ -15091,7 +15294,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
@@ -15127,7 +15333,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
@@ -15242,7 +15453,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
@@ -15367,7 +15581,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
@@ -15403,7 +15620,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
@@ -16046,7 +16268,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
@@ -16171,7 +16396,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
@@ -16207,7 +16435,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
@@ -16322,7 +16555,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
@@ -16447,7 +16683,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
@@ -16483,7 +16722,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
@@ -16598,7 +16842,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
@@ -16723,7 +16970,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
@@ -16759,7 +17009,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
@@ -16874,6 +17129,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
@@ -16999,6 +17256,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -17035,6 +17294,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -17150,6 +17413,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
@@ -17275,6 +17540,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -17311,6 +17578,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -17426,7 +17697,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
@@ -17551,7 +17825,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
@@ -17587,7 +17864,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
@@ -17702,7 +17984,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
@@ -17827,7 +18112,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
@@ -17863,7 +18151,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
@@ -17978,7 +18271,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
@@ -18103,7 +18399,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
@@ -18139,7 +18438,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
@@ -18870,6 +19174,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
@@ -19013,6 +19319,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
@@ -19055,6 +19363,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
@@ -19185,10 +19497,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -19332,10 +19647,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -19376,10 +19694,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -19506,10 +19829,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -19653,10 +19979,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -19697,10 +20026,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -20445,10 +20779,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -20592,10 +20929,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -20636,10 +20976,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -20766,10 +21111,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -20913,10 +21261,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -20957,10 +21308,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -21087,10 +21443,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -21234,10 +21593,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -21278,10 +21640,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -21408,6 +21775,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
@@ -21555,6 +21924,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
@@ -21599,6 +21970,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
@@ -21729,6 +22104,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
@@ -21876,6 +22253,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
@@ -21920,6 +22299,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
@@ -22050,10 +22433,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -22197,10 +22583,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -22241,10 +22630,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -22371,10 +22765,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -22518,10 +22915,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -22562,10 +22962,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -22692,10 +23097,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -22839,10 +23247,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -22883,10 +23294,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
index 3bf5ed8..c326edf 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
@@ -959,7 +959,8 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
@@ -1001,7 +1002,8 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
@@ -1026,7 +1028,10 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
index b755c5d..868b438 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
@@ -667,7 +667,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -763,7 +764,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -790,7 +792,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -1204,7 +1209,8 @@ define amdgpu_kernel void @global_workgroup_release_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
@@ -1290,7 +1296,8 @@ define amdgpu_kernel void @global_workgroup_release_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
@@ -1315,7 +1322,10 @@ define amdgpu_kernel void @global_workgroup_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
@@ -1391,7 +1401,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
@@ -1477,7 +1488,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
@@ -1502,7 +1514,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
@@ -1918,7 +1933,8 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
@@ -2003,7 +2019,8 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
@@ -2028,7 +2045,10 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
@@ -2105,8 +2125,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_atomicrmw:
@@ -2196,8 +2218,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_acq_rel_atomicrmw:
@@ -2223,8 +2247,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_acq_rel_atomicrmw:
@@ -2301,8 +2329,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_atomicrmw:
@@ -2392,8 +2422,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_seq_cst_atomicrmw:
@@ -2419,8 +2451,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_seq_cst_atomicrmw:
@@ -2705,7 +2741,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -2807,7 +2844,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -2837,7 +2875,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -2926,7 +2967,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -3028,7 +3070,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -3058,7 +3101,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -3644,7 +3690,8 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -3758,7 +3805,8 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -3791,7 +3839,10 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -3900,8 +3951,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
@@ -4020,8 +4073,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
@@ -4055,8 +4110,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
@@ -4165,8 +4224,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
@@ -4285,8 +4346,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
@@ -4320,8 +4383,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
@@ -4920,8 +4987,10 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_release_acquire_cmpxchg:
@@ -5040,8 +5109,10 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_release_acquire_cmpxchg:
@@ -5075,8 +5146,12 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_release_acquire_cmpxchg:
@@ -5185,8 +5260,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
@@ -5305,8 +5382,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
@@ -5340,8 +5419,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
@@ -5450,8 +5533,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
@@ -5570,8 +5655,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
@@ -5605,8 +5692,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
@@ -5715,7 +5806,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -5835,7 +5927,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -5870,7 +5963,10 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -5980,7 +6076,8 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -6100,7 +6197,8 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -6135,7 +6233,10 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -6245,8 +6346,10 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_release_seq_cst_cmpxchg:
@@ -6365,8 +6468,10 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg:
@@ -6400,8 +6505,12 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_release_seq_cst_cmpxchg:
@@ -6510,8 +6619,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
@@ -6630,8 +6741,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
@@ -6665,8 +6778,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
@@ -6775,8 +6892,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
@@ -6895,8 +7014,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
@@ -6930,8 +7051,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
@@ -7588,7 +7713,8 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -7717,7 +7843,8 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -7754,7 +7881,10 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -7877,7 +8007,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -8009,7 +8140,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -8047,7 +8179,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -8170,7 +8305,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -8302,7 +8438,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -8340,7 +8477,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -9009,7 +9149,8 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -9141,7 +9282,8 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -9179,7 +9321,10 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -9302,7 +9447,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -9434,7 +9580,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -9472,7 +9619,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -9595,7 +9745,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -9727,7 +9878,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -9765,7 +9917,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -9888,7 +10043,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -10020,7 +10176,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -10058,7 +10215,10 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -10181,7 +10341,8 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -10313,7 +10474,8 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -10351,7 +10513,10 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -10474,7 +10639,8 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -10606,7 +10772,8 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -10644,7 +10811,10 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -10767,7 +10937,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -10899,7 +11070,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -10937,7 +11109,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -11060,7 +11235,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -11192,7 +11368,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -11230,7 +11407,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -11914,7 +12094,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -12009,7 +12190,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -12036,6 +12218,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -12447,6 +12633,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
@@ -12529,6 +12717,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
@@ -12553,6 +12743,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
@@ -12626,6 +12820,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
@@ -12708,6 +12904,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
@@ -12732,6 +12930,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
@@ -13145,6 +13347,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
@@ -13226,6 +13430,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
@@ -13250,6 +13456,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
@@ -13324,7 +13534,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
@@ -13411,7 +13624,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
@@ -13437,7 +13653,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
@@ -13512,7 +13733,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
@@ -13599,7 +13823,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
@@ -13625,7 +13852,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
@@ -13908,6 +14140,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -14006,6 +14240,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -14035,6 +14271,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -14121,6 +14361,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -14219,6 +14461,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -14248,6 +14492,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -14831,6 +15079,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -14941,6 +15191,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -14973,6 +15225,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -15079,7 +15335,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
@@ -15195,7 +15454,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
@@ -15229,7 +15491,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
@@ -15336,7 +15603,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
@@ -15452,7 +15722,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
@@ -15486,7 +15759,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
@@ -16083,7 +16361,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
@@ -16199,7 +16480,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
@@ -16233,7 +16517,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
@@ -16340,7 +16629,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
@@ -16456,7 +16748,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
@@ -16490,7 +16785,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
@@ -16597,7 +16897,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
@@ -16713,7 +17016,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
@@ -16747,7 +17053,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
@@ -16854,6 +17165,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -16970,6 +17283,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -17004,6 +17319,10 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -17111,6 +17430,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -17227,6 +17548,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -17261,6 +17584,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -17368,7 +17695,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
@@ -17484,7 +17814,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
@@ -17518,7 +17851,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
@@ -17625,7 +17963,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
@@ -17741,7 +18082,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
@@ -17775,7 +18119,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
@@ -17882,7 +18231,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
@@ -17998,7 +18350,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
@@ -18032,7 +18387,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
@@ -18687,6 +19047,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -18812,6 +19174,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -18848,6 +19212,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -18968,6 +19336,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -19096,6 +19466,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -19133,6 +19505,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -19253,6 +19629,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -19381,6 +19759,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -19418,6 +19798,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -20084,6 +20468,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -20212,6 +20598,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -20249,6 +20637,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -20369,6 +20761,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -20497,6 +20891,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -20534,6 +20930,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -20654,6 +21054,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -20782,6 +21184,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -20819,6 +21223,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -20939,6 +21347,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -21067,6 +21477,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -21104,6 +21516,10 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -21224,6 +21640,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -21352,6 +21770,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -21389,6 +21809,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -21509,6 +21933,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -21637,6 +22063,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -21674,6 +22102,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -21794,6 +22226,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -21922,6 +22356,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -21959,6 +22395,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -22079,6 +22519,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -22207,6 +22649,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -22244,6 +22688,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
index 986b48b..712109d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
@@ -622,7 +622,8 @@ define amdgpu_kernel void @local_agent_seq_cst_load(
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_read_b32 v1, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -719,7 +720,8 @@ define amdgpu_kernel void @local_agent_seq_cst_load(
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_load_b32 v1, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -749,7 +751,10 @@ define amdgpu_kernel void @local_agent_seq_cst_load(
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_load_b32 v1, v0
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -1121,7 +1126,8 @@ define amdgpu_kernel void @local_agent_release_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1200,7 +1206,8 @@ define amdgpu_kernel void @local_agent_release_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1225,7 +1232,10 @@ define amdgpu_kernel void @local_agent_release_store(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1291,7 +1301,8 @@ define amdgpu_kernel void @local_agent_seq_cst_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1370,7 +1381,8 @@ define amdgpu_kernel void @local_agent_seq_cst_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1395,7 +1407,10 @@ define amdgpu_kernel void @local_agent_seq_cst_store(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1778,7 +1793,8 @@ define amdgpu_kernel void @local_agent_release_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1857,7 +1873,8 @@ define amdgpu_kernel void @local_agent_release_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1882,7 +1899,10 @@ define amdgpu_kernel void @local_agent_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1952,7 +1972,8 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -2039,7 +2060,8 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -2067,7 +2089,10 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -2139,7 +2164,8 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -2226,7 +2252,8 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -2254,7 +2281,10 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -2535,7 +2565,8 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -2639,7 +2670,8 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2671,7 +2703,10 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2756,7 +2791,8 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -2860,7 +2896,8 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2892,7 +2929,10 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -3348,7 +3388,8 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -3441,7 +3482,8 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -3470,7 +3512,10 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -3551,7 +3596,8 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -3652,7 +3698,8 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -3684,7 +3731,10 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -3767,7 +3817,8 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -3868,7 +3919,8 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -3900,7 +3952,10 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4375,7 +4430,8 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4476,7 +4532,8 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4508,7 +4565,10 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4591,7 +4651,8 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4692,7 +4753,8 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4724,7 +4786,10 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4807,7 +4872,8 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4908,7 +4974,8 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4940,7 +5007,10 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5023,7 +5093,8 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5124,7 +5195,8 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5156,7 +5228,10 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5239,7 +5314,8 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5340,7 +5416,8 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5372,7 +5449,10 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5455,7 +5535,8 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5556,7 +5637,8 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5588,7 +5670,10 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5671,7 +5756,8 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5772,7 +5858,8 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5804,7 +5891,10 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5887,7 +5977,8 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5988,7 +6079,8 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -6020,7 +6112,10 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -6567,7 +6662,8 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -6682,7 +6778,8 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -6717,7 +6814,10 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: s_wait_dscnt 0x0
@@ -6814,7 +6914,8 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -6932,7 +7033,8 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -6968,7 +7070,10 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7065,7 +7170,8 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -7183,7 +7289,8 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7219,7 +7326,10 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7778,7 +7888,8 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -7896,7 +8007,8 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7932,7 +8044,10 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8029,7 +8144,8 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8147,7 +8263,8 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8183,7 +8300,10 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8280,7 +8400,8 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8398,7 +8519,8 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8434,7 +8556,10 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8531,7 +8656,8 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8649,7 +8775,8 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8685,7 +8812,10 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8782,7 +8912,8 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8900,7 +9031,8 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8936,7 +9068,10 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9033,7 +9168,8 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9151,7 +9287,8 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9187,7 +9324,10 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9284,7 +9424,8 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9402,7 +9543,8 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9438,7 +9580,10 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9535,7 +9680,8 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9653,7 +9799,8 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9689,7 +9836,10 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll
index 8926893..6d1e4e6 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll
@@ -622,7 +622,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_load(
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_read_b32 v1, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -719,7 +720,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_load(
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_load_b32 v1, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -749,7 +751,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_load(
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_load_b32 v1, v0
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -1121,7 +1126,8 @@ define amdgpu_kernel void @local_cluster_release_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1200,7 +1206,8 @@ define amdgpu_kernel void @local_cluster_release_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1225,7 +1232,10 @@ define amdgpu_kernel void @local_cluster_release_store(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1291,7 +1301,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1370,7 +1381,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1395,7 +1407,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_store(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1778,7 +1793,8 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1857,7 +1873,8 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1882,7 +1899,10 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1952,7 +1972,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -2039,7 +2060,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -2067,7 +2089,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -2139,7 +2164,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -2226,7 +2252,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -2254,7 +2281,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -2535,7 +2565,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -2639,7 +2670,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2671,7 +2703,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2756,7 +2791,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -2860,7 +2896,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2892,7 +2929,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -3348,7 +3388,8 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -3441,7 +3482,8 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -3470,7 +3512,10 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -3551,7 +3596,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -3652,7 +3698,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -3684,7 +3731,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -3767,7 +3817,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -3868,7 +3919,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -3900,7 +3952,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4375,7 +4430,8 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4476,7 +4532,8 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4508,7 +4565,10 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4591,7 +4651,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4692,7 +4753,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4724,7 +4786,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4807,7 +4872,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4908,7 +4974,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4940,7 +5007,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5023,7 +5093,8 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5124,7 +5195,8 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5156,7 +5228,10 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5239,7 +5314,8 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5340,7 +5416,8 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5372,7 +5449,10 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5455,7 +5535,8 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5556,7 +5637,8 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5588,7 +5670,10 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5671,7 +5756,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5772,7 +5858,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5804,7 +5891,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5887,7 +5977,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5988,7 +6079,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -6020,7 +6112,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -6567,7 +6662,8 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -6682,7 +6778,8 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -6717,7 +6814,10 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: s_wait_dscnt 0x0
@@ -6814,7 +6914,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -6932,7 +7033,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -6968,7 +7070,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7065,7 +7170,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -7183,7 +7289,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7219,7 +7326,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7778,7 +7888,8 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -7896,7 +8007,8 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7932,7 +8044,10 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8029,7 +8144,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8147,7 +8263,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8183,7 +8300,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8280,7 +8400,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8398,7 +8519,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8434,7 +8556,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8531,7 +8656,8 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8649,7 +8775,8 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8685,7 +8812,10 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8782,7 +8912,8 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8900,7 +9031,8 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8936,7 +9068,10 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9033,7 +9168,8 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9151,7 +9287,8 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9187,7 +9324,10 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9284,7 +9424,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9402,7 +9543,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9438,7 +9580,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9535,7 +9680,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9653,7 +9799,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9689,7 +9836,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
index 81bbe0a..577d2ca 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
@@ -622,7 +622,8 @@ define amdgpu_kernel void @local_system_seq_cst_load(
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_read_b32 v1, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -719,7 +720,8 @@ define amdgpu_kernel void @local_system_seq_cst_load(
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_load_b32 v1, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -749,7 +751,10 @@ define amdgpu_kernel void @local_system_seq_cst_load(
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_load_b32 v1, v0
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -1121,7 +1126,8 @@ define amdgpu_kernel void @local_system_release_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1200,7 +1206,8 @@ define amdgpu_kernel void @local_system_release_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1225,7 +1232,10 @@ define amdgpu_kernel void @local_system_release_store(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1291,7 +1301,8 @@ define amdgpu_kernel void @local_system_seq_cst_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1370,7 +1381,8 @@ define amdgpu_kernel void @local_system_seq_cst_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1395,7 +1407,10 @@ define amdgpu_kernel void @local_system_seq_cst_store(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1778,7 +1793,8 @@ define amdgpu_kernel void @local_system_release_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1857,7 +1873,8 @@ define amdgpu_kernel void @local_system_release_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1882,7 +1899,10 @@ define amdgpu_kernel void @local_system_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1952,7 +1972,8 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -2039,7 +2060,8 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -2067,7 +2089,10 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -2139,7 +2164,8 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -2226,7 +2252,8 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -2254,7 +2281,10 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -2535,7 +2565,8 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -2639,7 +2670,8 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2671,7 +2703,10 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2756,7 +2791,8 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -2860,7 +2896,8 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2892,7 +2929,10 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -3348,7 +3388,8 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -3441,7 +3482,8 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -3470,7 +3512,10 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -3551,7 +3596,8 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -3652,7 +3698,8 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -3684,7 +3731,10 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -3767,7 +3817,8 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -3868,7 +3919,8 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -3900,7 +3952,10 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4375,7 +4430,8 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4476,7 +4532,8 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4508,7 +4565,10 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4591,7 +4651,8 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4692,7 +4753,8 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4724,7 +4786,10 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4807,7 +4872,8 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4908,7 +4974,8 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4940,7 +5007,10 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5023,7 +5093,8 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5124,7 +5195,8 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5156,7 +5228,10 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5239,7 +5314,8 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5340,7 +5416,8 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5372,7 +5449,10 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5455,7 +5535,8 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5556,7 +5637,8 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5588,7 +5670,10 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5671,7 +5756,8 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5772,7 +5858,8 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5804,7 +5891,10 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5887,7 +5977,8 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5988,7 +6079,8 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -6020,7 +6112,10 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -6567,7 +6662,8 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -6682,7 +6778,8 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -6717,7 +6814,10 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: s_wait_dscnt 0x0
@@ -6814,7 +6914,8 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -6932,7 +7033,8 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -6968,7 +7070,10 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7065,7 +7170,8 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -7183,7 +7289,8 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7219,7 +7326,10 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7778,7 +7888,8 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -7896,7 +8007,8 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7932,7 +8044,10 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8029,7 +8144,8 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8147,7 +8263,8 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8183,7 +8300,10 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8280,7 +8400,8 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8398,7 +8519,8 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8434,7 +8556,10 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8531,7 +8656,8 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8649,7 +8775,8 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8685,7 +8812,10 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8782,7 +8912,8 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8900,7 +9031,8 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8936,7 +9068,10 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9033,7 +9168,8 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9151,7 +9287,8 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9187,7 +9324,10 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9284,7 +9424,8 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9402,7 +9543,8 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9438,7 +9580,10 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9535,7 +9680,8 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9653,7 +9799,8 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9689,7 +9836,10 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index 980141a..d686e7a 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -819,7 +819,8 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -854,7 +855,8 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -879,7 +881,10 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
index 6a233a2..ab4d783 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
@@ -622,7 +622,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load(
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_read_b32 v1, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -719,7 +720,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load(
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_load_b32 v1, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -749,7 +751,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load(
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_load_b32 v1, v0
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -1121,7 +1126,8 @@ define amdgpu_kernel void @local_workgroup_release_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1200,7 +1206,8 @@ define amdgpu_kernel void @local_workgroup_release_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1225,7 +1232,10 @@ define amdgpu_kernel void @local_workgroup_release_store(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1291,7 +1301,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1370,7 +1381,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1395,7 +1407,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1778,7 +1793,8 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1857,7 +1873,8 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1882,7 +1899,10 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1952,7 +1972,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -2039,7 +2060,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -2067,7 +2089,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -2139,7 +2164,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -2226,7 +2252,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -2254,7 +2281,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -2535,7 +2565,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -2639,7 +2670,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2671,7 +2703,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2756,7 +2791,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -2860,7 +2896,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2892,7 +2929,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -3348,7 +3388,8 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -3441,7 +3482,8 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -3470,7 +3512,10 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -3551,7 +3596,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -3652,7 +3698,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -3684,7 +3731,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -3767,7 +3817,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -3868,7 +3919,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -3900,7 +3952,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4375,7 +4430,8 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4476,7 +4532,8 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4508,7 +4565,10 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4591,7 +4651,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4692,7 +4753,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4724,7 +4786,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4807,7 +4872,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4908,7 +4974,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4940,7 +5007,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5023,7 +5093,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5124,7 +5195,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5156,7 +5228,10 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5239,7 +5314,8 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5340,7 +5416,8 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5372,7 +5449,10 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5455,7 +5535,8 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5556,7 +5637,8 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5588,7 +5670,10 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5671,7 +5756,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5772,7 +5858,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5804,7 +5891,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5887,7 +5977,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5988,7 +6079,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -6020,7 +6112,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -6567,7 +6662,8 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -6682,7 +6778,8 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -6717,7 +6814,10 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: s_wait_dscnt 0x0
@@ -6814,7 +6914,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -6932,7 +7033,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -6968,7 +7070,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7065,7 +7170,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -7183,7 +7289,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7219,7 +7326,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7778,7 +7888,8 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -7896,7 +8007,8 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7932,7 +8044,10 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8029,7 +8144,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8147,7 +8263,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8183,7 +8300,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8280,7 +8400,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8398,7 +8519,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8434,7 +8556,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8531,7 +8656,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8649,7 +8775,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8685,7 +8812,10 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8782,7 +8912,8 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8900,7 +9031,8 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8936,7 +9068,10 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9033,7 +9168,8 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9151,7 +9287,8 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9187,7 +9324,10 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9284,7 +9424,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9402,7 +9543,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9438,7 +9580,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9535,7 +9680,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9653,7 +9799,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9689,7 +9836,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll
index d578d2e..60570bd 100644
--- a/llvm/test/CodeGen/AMDGPU/minmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/minmax.ll
@@ -1296,4 +1296,4 @@ declare half @llvm.minnum.f16(half, half)
declare half @llvm.maxnum.f16(half, half)
declare float @llvm.minnum.f32(float, float)
declare float @llvm.maxnum.f32(float, float)
-attributes #0 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }
+attributes #0 = { nounwind "no-nans-fp-math"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
index c1cf06e..fba42c4 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
+++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
@@ -388,9 +388,8 @@ body: |
; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY]], implicit-def $scc
- ; GCN-NEXT: S_NOP 0, implicit killed $scc
- ; GCN-NEXT: S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc
+ ; GCN-NEXT: S_BITCMP1_B32 killed [[COPY]], 0, implicit-def $scc
+ ; GCN-NEXT: S_NOP 0, implicit $scc
; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
; GCN-NEXT: S_BRANCH %bb.1
; GCN-NEXT: {{ $}}
@@ -417,6 +416,80 @@ body: |
S_ENDPGM 0
...
+---
+name: xor_1_cmp_lg_0_killed_scc
+body: |
+ ; GCN-LABEL: name: xor_1_cmp_lg_0_killed_scc
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GCN-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 1, killed [[COPY]], implicit-def $scc
+ ; GCN-NEXT: S_NOP 0, implicit $scc
+ ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
+ ; GCN-NEXT: S_BRANCH %bb.1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: $sgpr0, $vgpr0_vgpr1
+
+ %0:sreg_32 = COPY $sgpr0
+ %1:sreg_32 = S_XOR_B32 1, killed %0, implicit-def $scc
+ S_NOP 0, implicit killed $scc
+ S_CMP_LG_U32 killed %1:sreg_32, 0, implicit-def $scc
+ S_CBRANCH_SCC0 %bb.2, implicit $scc
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ bb.2:
+ S_ENDPGM 0
+
+...
+---
+name: absdiff_1_cmp_lg_0_killed_scc
+body: |
+ ; GCN-LABEL: name: absdiff_1_cmp_lg_0_killed_scc
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GCN-NEXT: [[S_ABSDIFF_I32_:%[0-9]+]]:sreg_32 = S_ABSDIFF_I32 1, killed [[COPY]], implicit-def $scc
+ ; GCN-NEXT: S_NOP 0, implicit $scc
+ ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
+ ; GCN-NEXT: S_BRANCH %bb.1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: $sgpr0, $vgpr0_vgpr1
+
+ %0:sreg_32 = COPY $sgpr0
+ %1:sreg_32 = S_ABSDIFF_I32 1, killed %0, implicit-def $scc
+ S_NOP 0, implicit killed $scc
+ S_CMP_LG_U32 killed %1:sreg_32, 0, implicit-def $scc
+ S_CBRANCH_SCC0 %bb.2, implicit $scc
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ bb.2:
+ S_ENDPGM 0
+
+...
---
name: and_1_cmp_eq_1_clobbered_scc
@@ -2070,8 +2143,7 @@ body: |
; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 3, killed [[COPY]], implicit-def dead $scc
- ; GCN-NEXT: S_CMP_LG_U32 killed [[S_AND_B32_]], 0, implicit-def $scc
+ ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 3, killed [[COPY]], implicit-def $scc
; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
; GCN-NEXT: S_BRANCH %bb.1
; GCN-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/prevent-fmul-hoist-ir.ll b/llvm/test/CodeGen/AMDGPU/prevent-fmul-hoist-ir.ll
index ef3e04c..6ce614b 100644
--- a/llvm/test/CodeGen/AMDGPU/prevent-fmul-hoist-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/prevent-fmul-hoist-ir.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -passes='simplifycfg<hoist-common-insts>' -mtriple=amdgcn-- --fp-contract=fast -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX -check-prefix=FP-CONTRACT-FAST %s
-; RUN: opt -S -passes='simplifycfg<hoist-common-insts>' -mtriple=amdgcn-- --fp-contract=off --enable-unsafe-fp-math -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX -check-prefix=UNSAFE-FP-MATH %s
+; RUN: opt -S -passes='simplifycfg<hoist-common-insts>' -mtriple=amdgcn-- --fp-contract=off -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX -check-prefix=UNSAFE-FP-MATH %s
; RUN: opt -S -passes='simplifycfg<hoist-common-insts>' -mtriple=amdgcn-- --fp-contract=off -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX -check-prefix=NO-UNSAFE-FP-MATH %s
define double @is_profitable_f64_contract(ptr dereferenceable(8) %ptr_x, ptr dereferenceable(8) %ptr_y, ptr dereferenceable(8) %ptr_a) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
index f53aaaa..dd5f838 100644
--- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
declare i32 @llvm.ctpop.i32(i32)
declare i64 @llvm.ctpop.i64(i64)
@@ -10,7 +10,6 @@ define amdgpu_ps i32 @shl32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: shl32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshl_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -25,7 +24,6 @@ define amdgpu_ps i32 @shl64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: shl64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -40,7 +38,6 @@ define amdgpu_ps i32 @lshr32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: lshr32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshr_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -55,7 +52,6 @@ define amdgpu_ps i32 @lshr64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: lshr64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -70,7 +66,6 @@ define amdgpu_ps i32 @ashr32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: ashr32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_ashr_i32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -85,7 +80,6 @@ define amdgpu_ps i32 @ashr64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: ashr64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_ashr_i64 s[0:1], s[0:1], s2
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -100,7 +94,6 @@ define amdgpu_ps i32 @abs32(i32 inreg %val0) {
; CHECK-LABEL: abs32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_abs_i32 s0, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -121,7 +114,6 @@ define amdgpu_ps i32 @and32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: and32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -136,7 +128,6 @@ define amdgpu_ps i32 @and64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: and64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -151,7 +142,6 @@ define amdgpu_ps i32 @or32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: or32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_or_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -166,7 +156,6 @@ define amdgpu_ps i32 @or64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: or64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -181,7 +170,6 @@ define amdgpu_ps i32 @xor32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: xor32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_xor_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -196,7 +184,6 @@ define amdgpu_ps i32 @xor64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: xor64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -211,7 +198,6 @@ define amdgpu_ps i32 @nand32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: nand32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_nand_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -231,7 +217,6 @@ define amdgpu_ps i32 @nand64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: nand64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -251,7 +236,6 @@ define amdgpu_ps i32 @nor32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: nor32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_nor_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -271,7 +255,6 @@ define amdgpu_ps i32 @nor64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: nor64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -291,7 +274,6 @@ define amdgpu_ps i32 @xnor32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: xnor32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_xnor_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -311,7 +293,6 @@ define amdgpu_ps i32 @xnor64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: xnor64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_xnor_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -331,7 +312,6 @@ define amdgpu_ps i32 @andn232(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: andn232:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_andn2_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -347,7 +327,6 @@ define amdgpu_ps i32 @nandn264(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: nandn264:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -363,7 +342,6 @@ define amdgpu_ps i32 @orn232(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: orn232:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_orn2_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -379,7 +357,6 @@ define amdgpu_ps i32 @orn264(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: orn264:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_orn2_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -395,7 +372,6 @@ define amdgpu_ps i32 @bfe_i32(i32 inreg %val0) {
; CHECK-LABEL: bfe_i32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_bfe_i32 s0, s0, 0x80010
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -433,7 +409,6 @@ define amdgpu_ps i32 @bfe_u32(i32 inreg %val0) {
; CHECK-LABEL: bfe_u32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_bfe_u32 s0, s0, 0x80010
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -513,7 +488,6 @@ define amdgpu_ps i32 @bcnt132(i32 inreg %val0) {
; CHECK-LABEL: bcnt132:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -552,7 +526,6 @@ define amdgpu_ps i32 @quadmask32(i32 inreg %val0) {
; CHECK-LABEL: quadmask32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_quadmask_b32 s0, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -571,7 +544,6 @@ define amdgpu_ps i32 @quadmask64(i64 inreg %val0) {
; CHECK-LABEL: quadmask64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_quadmask_b64 s[0:1], s[0:1]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -590,7 +562,6 @@ define amdgpu_ps i32 @not32(i32 inreg %val0) {
; CHECK-LABEL: not32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_not_b32 s0, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -609,7 +580,6 @@ define amdgpu_ps i32 @not64(i64 inreg %val0) {
; CHECK-LABEL: not64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_not_b64 s[0:1], s[0:1]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -623,3 +593,35 @@ define amdgpu_ps i32 @not64(i64 inreg %val0) {
%zext = zext i1 %cmp to i32
ret i32 %zext
}
+
+
+; --------------------------------------------------------------------------------
+; Negative tests: the s_cmp against 0 must not be removed here.
+; --------------------------------------------------------------------------------
+
+@1 = extern_weak dso_local addrspace(4) constant i32
+
+define amdgpu_ps i32 @si_pc_add_rel_offset_must_not_optimize() {
+; CHECK-LABEL: si_pc_add_rel_offset_must_not_optimize:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_getpc_b64 s[0:1]
+; CHECK-NEXT: s_add_u32 s0, s0, __unnamed_1@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s1, s1, __unnamed_1@rel32@hi+12
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB35_2
+; CHECK-NEXT: ; %bb.1: ; %endif
+; CHECK-NEXT: s_mov_b32 s0, 1
+; CHECK-NEXT: s_branch .LBB35_3
+; CHECK-NEXT: .LBB35_2: ; %if
+; CHECK-NEXT: s_mov_b32 s0, 0
+; CHECK-NEXT: s_branch .LBB35_3
+; CHECK-NEXT: .LBB35_3:
+ %cmp = icmp ne ptr addrspace(4) @1, null ; @1 is declared extern_weak, so this null check on its address is a real runtime test
+ br i1 %cmp, label %endif, label %if ; non-null address -> %endif, null address -> %if
+
+if:
+ ret i32 0 ; reached when the address of @1 compares equal to null
+
+endif:
+ ret i32 1 ; reached when the address of @1 is non-null
+}
diff --git a/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll b/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll
index a828ee0..7552f6b 100644
--- a/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll
@@ -12,8 +12,6 @@ define amdgpu_ps i32 @s_uaddo_pseudo(i32 inreg %val0) {
; CHECK-LABEL: s_uaddo_pseudo:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, 1
-; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_addc_u32 s0, 1, 0
; CHECK-NEXT: ; return to shader part epilog
%pair = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %val0, i32 1)
@@ -32,8 +30,6 @@ define amdgpu_ps i32 @s_usubo_pseudo(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: s_usubo_pseudo:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_sub_u32 s0, s0, 1
-; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_subb_u32 s0, s1, 0
; CHECK-NEXT: ; return to shader part epilog
%pair = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %val0, i32 1)
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir
new file mode 100644
index 0000000..93f7bcc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir
@@ -0,0 +1,83 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -passes=postmisched -o - %s | FileCheck %s
+
+# Ensure the WMMA operations stay ahead of the final atomic fence and barrier
+# group, so that their latency can be hidden by the barrier wait.
+---
+name: test
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr12, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8, $vgpr9_vgpr10_vgpr11_vgpr12, $vgpr13_vgpr14_vgpr15_vgpr16, $vgpr17_vgpr18_vgpr19_vgpr20, $vgpr21_vgpr22_vgpr23_vgpr24, $vgpr25_vgpr26_vgpr27_vgpr28, $vgpr29_vgpr30_vgpr31_vgpr32
+
+ ; CHECK-LABEL: name: test
+ ; CHECK: liveins: $sgpr0, $sgpr12, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8, $vgpr9_vgpr10_vgpr11_vgpr12, $vgpr13_vgpr14_vgpr15_vgpr16, $vgpr17_vgpr18_vgpr19_vgpr20, $vgpr21_vgpr22_vgpr23_vgpr24, $vgpr25_vgpr26_vgpr27_vgpr28, $vgpr29_vgpr30_vgpr31_vgpr32
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: ATOMIC_FENCE 5, 2
+ ; CHECK-NEXT: S_BARRIER
+ ; CHECK-NEXT: ATOMIC_FENCE 4, 2
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr41_vgpr42_vgpr43_vgpr44, implicit-def $vgpr41, implicit-def $vgpr41_lo16, implicit-def $vgpr41_hi16, implicit-def $vgpr42, implicit-def $vgpr42_lo16, implicit-def $vgpr42_hi16, implicit-def $vgpr43, implicit-def $vgpr43_lo16, implicit-def $vgpr43_hi16, implicit-def $vgpr44, implicit-def $vgpr44_lo16, implicit-def $vgpr44_hi16, implicit-def $vgpr41_vgpr42, implicit-def $vgpr41_vgpr42_vgpr43, implicit-def $vgpr42_vgpr43, implicit-def $vgpr42_vgpr43_vgpr44, implicit-def $vgpr43_vgpr44, implicit-def $vgpr45_vgpr46_vgpr47_vgpr48, implicit-def $vgpr45, implicit-def $vgpr45_lo16, implicit-def $vgpr45_hi16, implicit-def $vgpr46, implicit-def $vgpr46_lo16, implicit-def $vgpr46_hi16, implicit-def $vgpr47, implicit-def $vgpr47_lo16, implicit-def $vgpr47_hi16, implicit-def $vgpr48, implicit-def $vgpr48_lo16, implicit-def $vgpr48_hi16, implicit-def $vgpr45_vgpr46, implicit-def $vgpr45_vgpr46_vgpr47, implicit-def $vgpr46_vgpr47, implicit-def $vgpr46_vgpr47_vgpr48, implicit-def $vgpr47_vgpr48, implicit-def $vgpr49_vgpr50_vgpr51_vgpr52, implicit-def $vgpr49, implicit-def $vgpr49_lo16, implicit-def $vgpr49_hi16, implicit-def $vgpr50, implicit-def $vgpr50_lo16, implicit-def $vgpr50_hi16, implicit-def $vgpr51, implicit-def $vgpr51_lo16, implicit-def $vgpr51_hi16, implicit-def $vgpr52, implicit-def $vgpr52_lo16, implicit-def $vgpr52_hi16, implicit-def $vgpr49_vgpr50, implicit-def $vgpr49_vgpr50_vgpr51, implicit-def $vgpr50_vgpr51, implicit-def $vgpr50_vgpr51_vgpr52, implicit-def $vgpr51_vgpr52, implicit-def $vgpr53_vgpr54_vgpr55_vgpr56, implicit-def $vgpr53, implicit-def $vgpr53_lo16, implicit-def $vgpr53_hi16, implicit-def $vgpr54, implicit-def $vgpr54_lo16, implicit-def $vgpr54_hi16, implicit-def $vgpr55, implicit-def $vgpr55_lo16, implicit-def $vgpr55_hi16, implicit-def $vgpr56, implicit-def $vgpr56_lo16, implicit-def $vgpr56_hi16, implicit-def $vgpr53_vgpr54, implicit-def $vgpr53_vgpr54_vgpr55, implicit-def $vgpr54_vgpr55, implicit-def 
$vgpr54_vgpr55_vgpr56, implicit-def $vgpr55_vgpr56, implicit-def $vgpr57_vgpr58_vgpr59_vgpr60, implicit-def $vgpr57, implicit-def $vgpr57_lo16, implicit-def $vgpr57_hi16, implicit-def $vgpr58, implicit-def $vgpr58_lo16, implicit-def $vgpr58_hi16, implicit-def $vgpr59, implicit-def $vgpr59_lo16, implicit-def $vgpr59_hi16, implicit-def $vgpr60, implicit-def $vgpr60_lo16, implicit-def $vgpr60_hi16, implicit-def $vgpr57_vgpr58, implicit-def $vgpr57_vgpr58_vgpr59, implicit-def $vgpr58_vgpr59, implicit-def $vgpr58_vgpr59_vgpr60, implicit-def $vgpr59_vgpr60, implicit-def $vgpr61_vgpr62_vgpr63_vgpr64, implicit-def $vgpr61, implicit-def $vgpr61_lo16, implicit-def $vgpr61_hi16, implicit-def $vgpr62, implicit-def $vgpr62_lo16, implicit-def $vgpr62_hi16, implicit-def $vgpr63, implicit-def $vgpr63_lo16, implicit-def $vgpr63_hi16, implicit-def $vgpr64, implicit-def $vgpr64_lo16, implicit-def $vgpr64_hi16, implicit-def $vgpr61_vgpr62, implicit-def $vgpr61_vgpr62_vgpr63, implicit-def $vgpr62_vgpr63, implicit-def $vgpr62_vgpr63_vgpr64, implicit-def $vgpr63_vgpr64, implicit-def $vgpr65_vgpr66_vgpr67_vgpr68, implicit-def $vgpr65, implicit-def $vgpr65_lo16, implicit-def $vgpr65_hi16, implicit-def $vgpr66, implicit-def $vgpr66_lo16, implicit-def $vgpr66_hi16, implicit-def $vgpr67, implicit-def $vgpr67_lo16, implicit-def $vgpr67_hi16, implicit-def $vgpr68, implicit-def $vgpr68_lo16, implicit-def $vgpr68_hi16, implicit-def $vgpr65_vgpr66, implicit-def $vgpr65_vgpr66_vgpr67, implicit-def $vgpr66_vgpr67, implicit-def $vgpr66_vgpr67_vgpr68, implicit-def $vgpr67_vgpr68, implicit-def $vgpr69_vgpr70_vgpr71_vgpr72, implicit-def $vgpr69, implicit-def $vgpr69_lo16, implicit-def $vgpr69_hi16, implicit-def $vgpr70, implicit-def $vgpr70_lo16, implicit-def $vgpr70_hi16, implicit-def $vgpr71, implicit-def $vgpr71_lo16, implicit-def $vgpr71_hi16, implicit-def $vgpr72, implicit-def $vgpr72_lo16, implicit-def $vgpr72_hi16, implicit-def $vgpr69_vgpr70, implicit-def $vgpr69_vgpr70_vgpr71, implicit-def 
$vgpr70_vgpr71, implicit-def $vgpr70_vgpr71_vgpr72, implicit-def $vgpr71_vgpr72, implicit-def $vgpr73_vgpr74_vgpr75_vgpr76, implicit-def $vgpr73, implicit-def $vgpr73_lo16, implicit-def $vgpr73_hi16, implicit-def $vgpr74, implicit-def $vgpr74_lo16, implicit-def $vgpr74_hi16, implicit-def $vgpr75, implicit-def $vgpr75_lo16, implicit-def $vgpr75_hi16, implicit-def $vgpr76, implicit-def $vgpr76_lo16, implicit-def $vgpr76_hi16, implicit-def $vgpr73_vgpr74, implicit-def $vgpr73_vgpr74_vgpr75, implicit-def $vgpr74_vgpr75, implicit-def $vgpr74_vgpr75_vgpr76, implicit-def $vgpr75_vgpr76, implicit-def $vgpr77_vgpr78_vgpr79_vgpr80, implicit-def $vgpr77, implicit-def $vgpr77_lo16, implicit-def $vgpr77_hi16, implicit-def $vgpr78, implicit-def $vgpr78_lo16, implicit-def $vgpr78_hi16, implicit-def $vgpr79, implicit-def $vgpr79_lo16, implicit-def $vgpr79_hi16, implicit-def $vgpr80, implicit-def $vgpr80_lo16, implicit-def $vgpr80_hi16, implicit-def $vgpr77_vgpr78, implicit-def $vgpr77_vgpr78_vgpr79, implicit-def $vgpr78_vgpr79, implicit-def $vgpr78_vgpr79_vgpr80, implicit-def $vgpr79_vgpr80, implicit-def $vgpr81_vgpr82_vgpr83_vgpr84, implicit-def $vgpr81, implicit-def $vgpr81_lo16, implicit-def $vgpr81_hi16, implicit-def $vgpr82, implicit-def $vgpr82_lo16, implicit-def $vgpr82_hi16, implicit-def $vgpr83, implicit-def $vgpr83_lo16, implicit-def $vgpr83_hi16, implicit-def $vgpr84, implicit-def $vgpr84_lo16, implicit-def $vgpr84_hi16, implicit-def $vgpr81_vgpr82, implicit-def $vgpr81_vgpr82_vgpr83, implicit-def $vgpr82_vgpr83, implicit-def $vgpr82_vgpr83_vgpr84, implicit-def $vgpr83_vgpr84, implicit-def $vgpr85_vgpr86_vgpr87_vgpr88, implicit-def $vgpr85, implicit-def $vgpr85_lo16, implicit-def $vgpr85_hi16, implicit-def $vgpr86, implicit-def $vgpr86_lo16, implicit-def $vgpr86_hi16, implicit-def $vgpr87, implicit-def $vgpr87_lo16, implicit-def $vgpr87_hi16, implicit-def $vgpr88, implicit-def $vgpr88_lo16, implicit-def $vgpr88_hi16, implicit-def $vgpr85_vgpr86, implicit-def 
$vgpr85_vgpr86_vgpr87, implicit-def $vgpr86_vgpr87, implicit-def $vgpr86_vgpr87_vgpr88, implicit-def $vgpr87_vgpr88, implicit killed $vgpr36, implicit $exec, implicit killed $vgpr37, implicit killed $vgpr38 {
+ ; CHECK-NEXT: $vgpr41_vgpr42_vgpr43_vgpr44 = DS_READ_B128_gfx9 $vgpr36, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr45_vgpr46_vgpr47_vgpr48 = DS_READ2_B64_gfx9 $vgpr36, 2, 3, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr49_vgpr50_vgpr51_vgpr52 = DS_READ_B128_gfx9 $vgpr37, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr53_vgpr54_vgpr55_vgpr56 = DS_READ2_B64_gfx9 $vgpr37, 2, 3, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr57_vgpr58_vgpr59_vgpr60 = DS_READ_B128_gfx9 $vgpr37, 768, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr61_vgpr62_vgpr63_vgpr64 = DS_READ2_B64_gfx9 killed $vgpr37, 98, 99, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr65_vgpr66_vgpr67_vgpr68 = DS_READ_B128_gfx9 $vgpr36, 768, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr69_vgpr70_vgpr71_vgpr72 = DS_READ2_B64_gfx9 $vgpr36, 98, 99, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr73_vgpr74_vgpr75_vgpr76 = DS_READ_B128_gfx9 $vgpr36, 1536, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr77_vgpr78_vgpr79_vgpr80 = DS_READ2_B64_gfx9 $vgpr36, 194, 195, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr81_vgpr82_vgpr83_vgpr84 = DS_READ_B128_gfx9 killed $vgpr36, 2304, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr85_vgpr86_vgpr87_vgpr88 = DS_READ2_B64_gfx9 killed $vgpr38, 0, 1, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: $sgpr1 = S_ADD_I32 $sgpr0, 16, implicit-def dead $scc
+ ; CHECK-NEXT: $vgpr39 = V_ADD_U32_e32 32, killed $vgpr39, implicit $exec
+ ; CHECK-NEXT: $vgpr40 = V_ADD_U32_e32 32, killed $vgpr40, implicit $exec
+ ; CHECK-NEXT: S_CMP_LT_U32 killed $sgpr0, killed $sgpr12, implicit-def $scc
+ ; CHECK-NEXT: $sgpr0 = S_MOV_B32 killed $sgpr1
+ ; CHECK-NEXT: early-clobber $vgpr29_vgpr30_vgpr31_vgpr32 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr29_vgpr30_vgpr31_vgpr32, 0, 0, implicit $exec
+ ; CHECK-NEXT: early-clobber $vgpr25_vgpr26_vgpr27_vgpr28 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr25_vgpr26_vgpr27_vgpr28, 0, 0, implicit $exec
+ ; CHECK-NEXT: early-clobber $vgpr21_vgpr22_vgpr23_vgpr24 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr21_vgpr22_vgpr23_vgpr24, 0, 0, implicit $exec
+ ; CHECK-NEXT: early-clobber $vgpr17_vgpr18_vgpr19_vgpr20 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr17_vgpr18_vgpr19_vgpr20, 0, 0, implicit $exec
+ ; CHECK-NEXT: early-clobber $vgpr13_vgpr14_vgpr15_vgpr16 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr13_vgpr14_vgpr15_vgpr16, 0, 0, implicit $exec
+ ; CHECK-NEXT: early-clobber $vgpr9_vgpr10_vgpr11_vgpr12 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr9_vgpr10_vgpr11_vgpr12, 0, 0, implicit $exec
+ ; CHECK-NEXT: early-clobber $vgpr5_vgpr6_vgpr7_vgpr8 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88, 8, killed $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr5_vgpr6_vgpr7_vgpr8, 0, 0, implicit $exec
+ ; CHECK-NEXT: early-clobber $vgpr1_vgpr2_vgpr3_vgpr4 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88, 8, killed $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr1_vgpr2_vgpr3_vgpr4, 0, 0, implicit $exec
+ ; CHECK-NEXT: ATOMIC_FENCE 5, 2
+ ; CHECK-NEXT: S_BARRIER
+ ; CHECK-NEXT: ATOMIC_FENCE 4, 2
+ ATOMIC_FENCE 5, 2
+ S_BARRIER
+ ATOMIC_FENCE 4, 2
+ BUNDLE implicit-def $vgpr41_vgpr42_vgpr43_vgpr44, implicit-def $vgpr41, implicit-def $vgpr41_lo16, implicit-def $vgpr41_hi16, implicit-def $vgpr42, implicit-def $vgpr42_lo16, implicit-def $vgpr42_hi16, implicit-def $vgpr43, implicit-def $vgpr43_lo16, implicit-def $vgpr43_hi16, implicit-def $vgpr44, implicit-def $vgpr44_lo16, implicit-def $vgpr44_hi16, implicit-def $vgpr41_vgpr42, implicit-def $vgpr41_vgpr42_vgpr43, implicit-def $vgpr42_vgpr43, implicit-def $vgpr42_vgpr43_vgpr44, implicit-def $vgpr43_vgpr44, implicit-def $vgpr45_vgpr46_vgpr47_vgpr48, implicit-def $vgpr45, implicit-def $vgpr45_lo16, implicit-def $vgpr45_hi16, implicit-def $vgpr46, implicit-def $vgpr46_lo16, implicit-def $vgpr46_hi16, implicit-def $vgpr47, implicit-def $vgpr47_lo16, implicit-def $vgpr47_hi16, implicit-def $vgpr48, implicit-def $vgpr48_lo16, implicit-def $vgpr48_hi16, implicit-def $vgpr45_vgpr46, implicit-def $vgpr45_vgpr46_vgpr47, implicit-def $vgpr46_vgpr47, implicit-def $vgpr46_vgpr47_vgpr48, implicit-def $vgpr47_vgpr48, implicit-def $vgpr49_vgpr50_vgpr51_vgpr52, implicit-def $vgpr49, implicit-def $vgpr49_lo16, implicit-def $vgpr49_hi16, implicit-def $vgpr50, implicit-def $vgpr50_lo16, implicit-def $vgpr50_hi16, implicit-def $vgpr51, implicit-def $vgpr51_lo16, implicit-def $vgpr51_hi16, implicit-def $vgpr52, implicit-def $vgpr52_lo16, implicit-def $vgpr52_hi16, implicit-def $vgpr49_vgpr50, implicit-def $vgpr49_vgpr50_vgpr51, implicit-def $vgpr50_vgpr51, implicit-def $vgpr50_vgpr51_vgpr52, implicit-def $vgpr51_vgpr52, implicit-def $vgpr53_vgpr54_vgpr55_vgpr56, implicit-def $vgpr53, implicit-def $vgpr53_lo16, implicit-def $vgpr53_hi16, implicit-def $vgpr54, implicit-def $vgpr54_lo16, implicit-def $vgpr54_hi16, implicit-def $vgpr55, implicit-def $vgpr55_lo16, implicit-def $vgpr55_hi16, implicit-def $vgpr56, implicit-def $vgpr56_lo16, implicit-def $vgpr56_hi16, implicit-def $vgpr53_vgpr54, implicit-def $vgpr53_vgpr54_vgpr55, implicit-def $vgpr54_vgpr55, implicit-def 
$vgpr54_vgpr55_vgpr56, implicit-def $vgpr55_vgpr56, implicit-def $vgpr57_vgpr58_vgpr59_vgpr60, implicit-def $vgpr57, implicit-def $vgpr57_lo16, implicit-def $vgpr57_hi16, implicit-def $vgpr58, implicit-def $vgpr58_lo16, implicit-def $vgpr58_hi16, implicit-def $vgpr59, implicit-def $vgpr59_lo16, implicit-def $vgpr59_hi16, implicit-def $vgpr60, implicit-def $vgpr60_lo16, implicit-def $vgpr60_hi16, implicit-def $vgpr57_vgpr58, implicit-def $vgpr57_vgpr58_vgpr59, implicit-def $vgpr58_vgpr59, implicit-def $vgpr58_vgpr59_vgpr60, implicit-def $vgpr59_vgpr60, implicit-def $vgpr61_vgpr62_vgpr63_vgpr64, implicit-def $vgpr61, implicit-def $vgpr61_lo16, implicit-def $vgpr61_hi16, implicit-def $vgpr62, implicit-def $vgpr62_lo16, implicit-def $vgpr62_hi16, implicit-def $vgpr63, implicit-def $vgpr63_lo16, implicit-def $vgpr63_hi16, implicit-def $vgpr64, implicit-def $vgpr64_lo16, implicit-def $vgpr64_hi16, implicit-def $vgpr61_vgpr62, implicit-def $vgpr61_vgpr62_vgpr63, implicit-def $vgpr62_vgpr63, implicit-def $vgpr62_vgpr63_vgpr64, implicit-def $vgpr63_vgpr64, implicit-def $vgpr65_vgpr66_vgpr67_vgpr68, implicit-def $vgpr65, implicit-def $vgpr65_lo16, implicit-def $vgpr65_hi16, implicit-def $vgpr66, implicit-def $vgpr66_lo16, implicit-def $vgpr66_hi16, implicit-def $vgpr67, implicit-def $vgpr67_lo16, implicit-def $vgpr67_hi16, implicit-def $vgpr68, implicit-def $vgpr68_lo16, implicit-def $vgpr68_hi16, implicit-def $vgpr65_vgpr66, implicit-def $vgpr65_vgpr66_vgpr67, implicit-def $vgpr66_vgpr67, implicit-def $vgpr66_vgpr67_vgpr68, implicit-def $vgpr67_vgpr68, implicit-def $vgpr69_vgpr70_vgpr71_vgpr72, implicit-def $vgpr69, implicit-def $vgpr69_lo16, implicit-def $vgpr69_hi16, implicit-def $vgpr70, implicit-def $vgpr70_lo16, implicit-def $vgpr70_hi16, implicit-def $vgpr71, implicit-def $vgpr71_lo16, implicit-def $vgpr71_hi16, implicit-def $vgpr72, implicit-def $vgpr72_lo16, implicit-def $vgpr72_hi16, implicit-def $vgpr69_vgpr70, implicit-def $vgpr69_vgpr70_vgpr71, implicit-def 
$vgpr70_vgpr71, implicit-def $vgpr70_vgpr71_vgpr72, implicit-def $vgpr71_vgpr72, implicit-def $vgpr73_vgpr74_vgpr75_vgpr76, implicit-def $vgpr73, implicit-def $vgpr73_lo16, implicit-def $vgpr73_hi16, implicit-def $vgpr74, implicit-def $vgpr74_lo16, implicit-def $vgpr74_hi16, implicit-def $vgpr75, implicit-def $vgpr75_lo16, implicit-def $vgpr75_hi16, implicit-def $vgpr76, implicit-def $vgpr76_lo16, implicit-def $vgpr76_hi16, implicit-def $vgpr73_vgpr74, implicit-def $vgpr73_vgpr74_vgpr75, implicit-def $vgpr74_vgpr75, implicit-def $vgpr74_vgpr75_vgpr76, implicit-def $vgpr75_vgpr76, implicit-def $vgpr77_vgpr78_vgpr79_vgpr80, implicit-def $vgpr77, implicit-def $vgpr77_lo16, implicit-def $vgpr77_hi16, implicit-def $vgpr78, implicit-def $vgpr78_lo16, implicit-def $vgpr78_hi16, implicit-def $vgpr79, implicit-def $vgpr79_lo16, implicit-def $vgpr79_hi16, implicit-def $vgpr80, implicit-def $vgpr80_lo16, implicit-def $vgpr80_hi16, implicit-def $vgpr77_vgpr78, implicit-def $vgpr77_vgpr78_vgpr79, implicit-def $vgpr78_vgpr79, implicit-def $vgpr78_vgpr79_vgpr80, implicit-def $vgpr79_vgpr80, implicit-def $vgpr81_vgpr82_vgpr83_vgpr84, implicit-def $vgpr81, implicit-def $vgpr81_lo16, implicit-def $vgpr81_hi16, implicit-def $vgpr82, implicit-def $vgpr82_lo16, implicit-def $vgpr82_hi16, implicit-def $vgpr83, implicit-def $vgpr83_lo16, implicit-def $vgpr83_hi16, implicit-def $vgpr84, implicit-def $vgpr84_lo16, implicit-def $vgpr84_hi16, implicit-def $vgpr81_vgpr82, implicit-def $vgpr81_vgpr82_vgpr83, implicit-def $vgpr82_vgpr83, implicit-def $vgpr82_vgpr83_vgpr84, implicit-def $vgpr83_vgpr84, implicit-def $vgpr85_vgpr86_vgpr87_vgpr88, implicit-def $vgpr85, implicit-def $vgpr85_lo16, implicit-def $vgpr85_hi16, implicit-def $vgpr86, implicit-def $vgpr86_lo16, implicit-def $vgpr86_hi16, implicit-def $vgpr87, implicit-def $vgpr87_lo16, implicit-def $vgpr87_hi16, implicit-def $vgpr88, implicit-def $vgpr88_lo16, implicit-def $vgpr88_hi16, implicit-def $vgpr85_vgpr86, implicit-def 
$vgpr85_vgpr86_vgpr87, implicit-def $vgpr86_vgpr87, implicit-def $vgpr86_vgpr87_vgpr88, implicit-def $vgpr87_vgpr88, implicit $vgpr36, implicit $exec, implicit $vgpr37, implicit $vgpr38 {
+ $vgpr41_vgpr42_vgpr43_vgpr44 = DS_READ_B128_gfx9 $vgpr36, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr45_vgpr46_vgpr47_vgpr48 = DS_READ2_B64_gfx9 $vgpr36, 2, 3, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr49_vgpr50_vgpr51_vgpr52 = DS_READ_B128_gfx9 $vgpr37, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr53_vgpr54_vgpr55_vgpr56 = DS_READ2_B64_gfx9 $vgpr37, 2, 3, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr57_vgpr58_vgpr59_vgpr60 = DS_READ_B128_gfx9 $vgpr37, 768, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr61_vgpr62_vgpr63_vgpr64 = DS_READ2_B64_gfx9 $vgpr37, 98, 99, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr65_vgpr66_vgpr67_vgpr68 = DS_READ_B128_gfx9 $vgpr36, 768, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr69_vgpr70_vgpr71_vgpr72 = DS_READ2_B64_gfx9 $vgpr36, 98, 99, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr73_vgpr74_vgpr75_vgpr76 = DS_READ_B128_gfx9 $vgpr36, 1536, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr77_vgpr78_vgpr79_vgpr80 = DS_READ2_B64_gfx9 $vgpr36, 194, 195, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr81_vgpr82_vgpr83_vgpr84 = DS_READ_B128_gfx9 $vgpr36, 2304, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr85_vgpr86_vgpr87_vgpr88 = DS_READ2_B64_gfx9 $vgpr38, 0, 1, 0, implicit $exec :: (load (s128), addrspace 3)
+ }
+ $sgpr1 = S_ADD_I32 $sgpr0, 16, implicit-def dead $scc
+ $vgpr39 = V_ADD_U32_e32 32, killed $vgpr39, implicit $exec
+ $vgpr40 = V_ADD_U32_e32 32, killed $vgpr40, implicit $exec
+ S_CMP_LT_U32 killed $sgpr0, $sgpr12, implicit-def $scc
+ early-clobber $vgpr29_vgpr30_vgpr31_vgpr32 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr29_vgpr30_vgpr31_vgpr32, 0, 0, implicit $exec
+ early-clobber $vgpr25_vgpr26_vgpr27_vgpr28 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr25_vgpr26_vgpr27_vgpr28, 0, 0, implicit $exec
+ early-clobber $vgpr21_vgpr22_vgpr23_vgpr24 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr21_vgpr22_vgpr23_vgpr24, 0, 0, implicit $exec
+ early-clobber $vgpr17_vgpr18_vgpr19_vgpr20 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr17_vgpr18_vgpr19_vgpr20, 0, 0, implicit $exec
+ early-clobber $vgpr13_vgpr14_vgpr15_vgpr16 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr13_vgpr14_vgpr15_vgpr16, 0, 0, implicit $exec
+ early-clobber $vgpr9_vgpr10_vgpr11_vgpr12 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr9_vgpr10_vgpr11_vgpr12, 0, 0, implicit $exec
+ early-clobber $vgpr5_vgpr6_vgpr7_vgpr8 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88, 8, killed $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr5_vgpr6_vgpr7_vgpr8, 0, 0, implicit $exec
+ early-clobber $vgpr1_vgpr2_vgpr3_vgpr4 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88, 8, killed $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr1_vgpr2_vgpr3_vgpr4, 0, 0, implicit $exec
+ $sgpr0 = S_MOV_B32 killed $sgpr1
+ ATOMIC_FENCE 5, 2
+ S_BARRIER
+ ATOMIC_FENCE 4, 2
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 5f6d622..71f5a94 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -56,10 +56,9 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_addc_u32 s15, 0, s16
; GCN-NEXT: s_add_u32 s16, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s16
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s12, v0
+; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s14, s14, s15
; GCN-NEXT: s_mul_i32 s0, s12, s14
; GCN-NEXT: v_readfirstlane_b32 s1, v0
@@ -90,7 +89,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_add_u32 s15, s16, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s14, s14, s12
; GCN-NEXT: s_ashr_i32 s12, s7, 31
; GCN-NEXT: s_add_u32 s0, s6, s12
@@ -116,52 +114,50 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: v_readfirstlane_b32 s4, v0
; GCN-NEXT: s_addc_u32 s4, s4, 0
; GCN-NEXT: s_mul_i32 s14, s7, s14
-; GCN-NEXT: s_add_u32 s14, s1, s14
-; GCN-NEXT: v_mov_b32_e32 v0, s14
+; GCN-NEXT: s_add_u32 s16, s1, s14
+; GCN-NEXT: v_mov_b32_e32 v0, s16
; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
-; GCN-NEXT: s_addc_u32 s15, 0, s4
+; GCN-NEXT: s_addc_u32 s17, 0, s4
; GCN-NEXT: s_mov_b32 s1, s5
-; GCN-NEXT: s_mul_i32 s4, s10, s15
+; GCN-NEXT: s_mul_i32 s4, s10, s17
; GCN-NEXT: v_readfirstlane_b32 s5, v0
; GCN-NEXT: s_add_i32 s4, s5, s4
-; GCN-NEXT: s_mul_i32 s5, s11, s14
-; GCN-NEXT: s_add_i32 s16, s4, s5
-; GCN-NEXT: s_sub_i32 s17, s7, s16
-; GCN-NEXT: s_mul_i32 s4, s10, s14
+; GCN-NEXT: s_mul_i32 s5, s11, s16
+; GCN-NEXT: s_add_i32 s18, s4, s5
+; GCN-NEXT: s_sub_i32 s14, s7, s18
+; GCN-NEXT: s_mul_i32 s4, s10, s16
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s18, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s18, 0
-; GCN-NEXT: s_subb_u32 s17, s17, s11
-; GCN-NEXT: s_sub_u32 s19, s6, s10
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT: s_or_b32 s15, s4, s5
+; GCN-NEXT: s_subb_u32 s19, s14, s11
+; GCN-NEXT: s_sub_u32 s20, s6, s10
+; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GCN-NEXT: s_or_b32 s14, s14, s15
+; GCN-NEXT: s_subb_u32 s14, s19, 0
+; GCN-NEXT: s_cmp_ge_u32 s14, s11
+; GCN-NEXT: s_cselect_b32 s15, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s20, s10
+; GCN-NEXT: s_cselect_b32 s19, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s14, s11
+; GCN-NEXT: s_cselect_b32 s14, s19, s15
+; GCN-NEXT: s_add_u32 s15, s16, 1
+; GCN-NEXT: s_addc_u32 s19, s17, 0
+; GCN-NEXT: s_add_u32 s20, s16, 2
+; GCN-NEXT: s_addc_u32 s21, s17, 0
+; GCN-NEXT: s_cmp_lg_u32 s14, 0
+; GCN-NEXT: s_cselect_b32 s14, s20, s15
+; GCN-NEXT: s_cselect_b32 s15, s21, s19
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_subb_u32 s4, s17, 0
+; GCN-NEXT: s_subb_u32 s4, s7, s18
; GCN-NEXT: s_cmp_ge_u32 s4, s11
; GCN-NEXT: s_cselect_b32 s5, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s19, s10
-; GCN-NEXT: s_cselect_b32 s17, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s4, s11
-; GCN-NEXT: s_cselect_b32 s4, s17, s5
-; GCN-NEXT: s_add_u32 s5, s14, 1
-; GCN-NEXT: s_addc_u32 s17, s15, 0
-; GCN-NEXT: s_add_u32 s19, s14, 2
-; GCN-NEXT: s_addc_u32 s20, s15, 0
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_cselect_b32 s4, s19, s5
-; GCN-NEXT: s_cselect_b32 s5, s20, s17
-; GCN-NEXT: s_cmp_lg_u32 s18, 0
-; GCN-NEXT: s_subb_u32 s7, s7, s16
-; GCN-NEXT: s_cmp_ge_u32 s7, s11
-; GCN-NEXT: s_cselect_b32 s16, -1, 0
; GCN-NEXT: s_cmp_ge_u32 s6, s10
; GCN-NEXT: s_cselect_b32 s6, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s7, s11
-; GCN-NEXT: s_cselect_b32 s6, s6, s16
-; GCN-NEXT: s_cmp_lg_u32 s6, 0
-; GCN-NEXT: s_cselect_b32 s5, s5, s15
-; GCN-NEXT: s_cselect_b32 s4, s4, s14
+; GCN-NEXT: s_cmp_eq_u32 s4, s11
+; GCN-NEXT: s_cselect_b32 s4, s6, s5
+; GCN-NEXT: s_cmp_lg_u32 s4, 0
+; GCN-NEXT: s_cselect_b32 s5, s15, s17
+; GCN-NEXT: s_cselect_b32 s4, s14, s16
; GCN-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9]
; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GCN-NEXT: s_sub_u32 s4, s4, s6
@@ -208,7 +204,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_add_u32 s18, s16, 1
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_or_b32 s10, s10, s11
-; GCN-IR-NEXT: s_cmp_lg_u32 s10, 0
; GCN-IR-NEXT: s_addc_u32 s10, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_sub_i32 s16, 63, s16
@@ -242,7 +237,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
; GCN-IR-NEXT: s_or_b32 s20, s20, s21
-; GCN-IR-NEXT: s_cmp_lg_u32 s20, 0
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[12:13], s[8:9]
@@ -1195,10 +1189,9 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_addc_u32 s12, 0, s13
; GCN-NEXT: s_add_u32 s13, s8, s9
; GCN-NEXT: v_mov_b32_e32 v0, s13
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: s_addc_u32 s11, s11, s12
; GCN-NEXT: s_mul_i32 s8, s2, s11
; GCN-NEXT: v_readfirstlane_b32 s9, v0
@@ -1229,7 +1222,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_add_u32 s2, s13, s2
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: s_addc_u32 s8, s11, s10
; GCN-NEXT: v_mul_hi_u32 v1, s2, 24
; GCN-NEXT: v_mul_hi_u32 v0, s8, 24
@@ -1238,48 +1230,46 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_readfirstlane_b32 s10, v1
; GCN-NEXT: v_readfirstlane_b32 s9, v0
; GCN-NEXT: s_add_u32 s8, s10, s8
-; GCN-NEXT: s_addc_u32 s10, 0, s9
-; GCN-NEXT: v_mov_b32_e32 v0, s10
+; GCN-NEXT: s_addc_u32 s12, 0, s9
+; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
-; GCN-NEXT: s_mul_i32 s8, s7, s10
+; GCN-NEXT: s_mul_i32 s8, s7, s12
; GCN-NEXT: v_readfirstlane_b32 s9, v0
-; GCN-NEXT: s_add_i32 s11, s9, s8
-; GCN-NEXT: s_sub_i32 s12, 0, s11
-; GCN-NEXT: s_mul_i32 s8, s6, s10
-; GCN-NEXT: s_sub_u32 s13, 24, s8
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s14, s8, s9
-; GCN-NEXT: s_cmp_lg_u32 s14, 0
-; GCN-NEXT: s_subb_u32 s12, s12, s7
-; GCN-NEXT: s_sub_u32 s15, s13, s6
+; GCN-NEXT: s_add_i32 s13, s9, s8
+; GCN-NEXT: s_sub_i32 s10, 0, s13
+; GCN-NEXT: s_mul_i32 s8, s6, s12
+; GCN-NEXT: s_sub_u32 s14, 24, s8
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: s_or_b32 s11, s8, s9
+; GCN-NEXT: s_subb_u32 s15, s10, s7
+; GCN-NEXT: s_sub_u32 s16, s14, s6
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_or_b32 s10, s10, s11
+; GCN-NEXT: s_subb_u32 s10, s15, 0
+; GCN-NEXT: s_cmp_ge_u32 s10, s7
+; GCN-NEXT: s_cselect_b32 s11, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s16, s6
+; GCN-NEXT: s_cselect_b32 s15, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s10, s7
+; GCN-NEXT: s_cselect_b32 s10, s15, s11
+; GCN-NEXT: s_add_u32 s11, s12, 1
+; GCN-NEXT: s_addc_u32 s15, 0, 0
+; GCN-NEXT: s_add_u32 s16, s12, 2
+; GCN-NEXT: s_addc_u32 s17, 0, 0
+; GCN-NEXT: s_cmp_lg_u32 s10, 0
+; GCN-NEXT: s_cselect_b32 s10, s16, s11
+; GCN-NEXT: s_cselect_b32 s11, s17, s15
; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
-; GCN-NEXT: s_subb_u32 s8, s12, 0
+; GCN-NEXT: s_subb_u32 s8, 0, s13
; GCN-NEXT: s_cmp_ge_u32 s8, s7
; GCN-NEXT: s_cselect_b32 s9, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s15, s6
-; GCN-NEXT: s_cselect_b32 s12, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s8, s7
-; GCN-NEXT: s_cselect_b32 s8, s12, s9
-; GCN-NEXT: s_add_u32 s9, s10, 1
-; GCN-NEXT: s_addc_u32 s12, 0, 0
-; GCN-NEXT: s_add_u32 s15, s10, 2
-; GCN-NEXT: s_addc_u32 s16, 0, 0
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
-; GCN-NEXT: s_cselect_b32 s8, s15, s9
-; GCN-NEXT: s_cselect_b32 s9, s16, s12
-; GCN-NEXT: s_cmp_lg_u32 s14, 0
-; GCN-NEXT: s_subb_u32 s11, 0, s11
-; GCN-NEXT: s_cmp_ge_u32 s11, s7
-; GCN-NEXT: s_cselect_b32 s12, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s13, s6
+; GCN-NEXT: s_cmp_ge_u32 s14, s6
; GCN-NEXT: s_cselect_b32 s6, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s11, s7
-; GCN-NEXT: s_cselect_b32 s6, s6, s12
+; GCN-NEXT: s_cmp_eq_u32 s8, s7
+; GCN-NEXT: s_cselect_b32 s6, s6, s9
; GCN-NEXT: s_cmp_lg_u32 s6, 0
-; GCN-NEXT: s_cselect_b32 s7, s9, 0
-; GCN-NEXT: s_cselect_b32 s6, s8, s10
+; GCN-NEXT: s_cselect_b32 s7, s11, 0
+; GCN-NEXT: s_cselect_b32 s6, s10, s12
; GCN-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_subb_u32 s7, s7, s4
@@ -1315,7 +1305,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s12, s10, 1
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_or_b32 s8, s8, s9
-; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
; GCN-IR-NEXT: s_addc_u32 s8, s11, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s10, 63, s10
@@ -1348,7 +1337,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s16, s16, 1
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_or_b32 s18, s18, s19
-; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0
; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
index 09596e9..7ddd90e 100644
--- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
define amdgpu_kernel void @sitofp_i16_to_f16(
; SI-LABEL: sitofp_i16_to_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll
index bbd1793..e12e31b 100644
--- a/llvm/test/CodeGen/AMDGPU/srem.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem.ll
@@ -1513,7 +1513,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9
; GCN-NEXT: s_sub_u32 s3, 0, s8
-; GCN-NEXT: s_subb_u32 s12, 0, s9
+; GCN-NEXT: s_subb_u32 s10, 0, s9
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1522,56 +1522,52 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s13, v1
-; GCN-NEXT: v_readfirstlane_b32 s10, v0
-; GCN-NEXT: s_mul_i32 s11, s3, s13
-; GCN-NEXT: s_mul_hi_u32 s15, s3, s10
-; GCN-NEXT: s_mul_i32 s14, s12, s10
-; GCN-NEXT: s_add_i32 s11, s15, s11
-; GCN-NEXT: s_add_i32 s11, s11, s14
-; GCN-NEXT: s_mul_i32 s16, s3, s10
-; GCN-NEXT: s_mul_i32 s15, s10, s11
-; GCN-NEXT: s_mul_hi_u32 s17, s10, s16
-; GCN-NEXT: s_mul_hi_u32 s14, s10, s11
+; GCN-NEXT: v_readfirstlane_b32 s11, v1
+; GCN-NEXT: v_readfirstlane_b32 s12, v0
+; GCN-NEXT: s_mul_i32 s13, s3, s11
+; GCN-NEXT: s_mul_hi_u32 s15, s3, s12
+; GCN-NEXT: s_mul_i32 s14, s10, s12
+; GCN-NEXT: s_add_i32 s13, s15, s13
+; GCN-NEXT: s_add_i32 s13, s13, s14
+; GCN-NEXT: s_mul_i32 s16, s3, s12
+; GCN-NEXT: s_mul_i32 s15, s12, s13
+; GCN-NEXT: s_mul_hi_u32 s17, s12, s16
+; GCN-NEXT: s_mul_hi_u32 s14, s12, s13
; GCN-NEXT: s_add_u32 s15, s17, s15
; GCN-NEXT: s_addc_u32 s14, 0, s14
-; GCN-NEXT: s_mul_hi_u32 s18, s13, s16
-; GCN-NEXT: s_mul_i32 s16, s13, s16
+; GCN-NEXT: s_mul_hi_u32 s18, s11, s16
+; GCN-NEXT: s_mul_i32 s16, s11, s16
; GCN-NEXT: s_add_u32 s15, s15, s16
-; GCN-NEXT: s_mul_hi_u32 s17, s13, s11
+; GCN-NEXT: s_mul_hi_u32 s17, s11, s13
; GCN-NEXT: s_addc_u32 s14, s14, s18
; GCN-NEXT: s_addc_u32 s15, s17, 0
-; GCN-NEXT: s_mul_i32 s11, s13, s11
-; GCN-NEXT: s_add_u32 s11, s14, s11
+; GCN-NEXT: s_mul_i32 s13, s11, s13
+; GCN-NEXT: s_add_u32 s13, s14, s13
; GCN-NEXT: s_addc_u32 s14, 0, s15
-; GCN-NEXT: s_add_u32 s15, s10, s11
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GCN-NEXT: s_addc_u32 s13, s13, s14
-; GCN-NEXT: s_mul_i32 s10, s3, s13
-; GCN-NEXT: s_mul_hi_u32 s11, s3, s15
-; GCN-NEXT: s_add_i32 s10, s11, s10
-; GCN-NEXT: s_mul_i32 s12, s12, s15
-; GCN-NEXT: s_add_i32 s10, s10, s12
-; GCN-NEXT: s_mul_i32 s3, s3, s15
-; GCN-NEXT: s_mul_hi_u32 s12, s13, s3
-; GCN-NEXT: s_mul_i32 s14, s13, s3
-; GCN-NEXT: s_mul_i32 s17, s15, s10
-; GCN-NEXT: s_mul_hi_u32 s3, s15, s3
-; GCN-NEXT: s_mul_hi_u32 s16, s15, s10
+; GCN-NEXT: s_add_u32 s12, s12, s13
+; GCN-NEXT: s_addc_u32 s11, s11, s14
+; GCN-NEXT: s_mul_i32 s13, s3, s11
+; GCN-NEXT: s_mul_hi_u32 s14, s3, s12
+; GCN-NEXT: s_add_i32 s13, s14, s13
+; GCN-NEXT: s_mul_i32 s10, s10, s12
+; GCN-NEXT: s_add_i32 s13, s13, s10
+; GCN-NEXT: s_mul_i32 s3, s3, s12
+; GCN-NEXT: s_mul_hi_u32 s14, s11, s3
+; GCN-NEXT: s_mul_i32 s15, s11, s3
+; GCN-NEXT: s_mul_i32 s17, s12, s13
+; GCN-NEXT: s_mul_hi_u32 s3, s12, s3
+; GCN-NEXT: s_mul_hi_u32 s16, s12, s13
; GCN-NEXT: s_add_u32 s3, s3, s17
; GCN-NEXT: s_addc_u32 s16, 0, s16
-; GCN-NEXT: s_add_u32 s3, s3, s14
-; GCN-NEXT: s_mul_hi_u32 s11, s13, s10
-; GCN-NEXT: s_addc_u32 s3, s16, s12
-; GCN-NEXT: s_addc_u32 s11, s11, 0
-; GCN-NEXT: s_mul_i32 s10, s13, s10
-; GCN-NEXT: s_add_u32 s3, s3, s10
-; GCN-NEXT: s_addc_u32 s12, 0, s11
-; GCN-NEXT: s_add_u32 s3, s15, s3
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GCN-NEXT: s_addc_u32 s14, s13, s12
+; GCN-NEXT: s_add_u32 s3, s3, s15
+; GCN-NEXT: s_mul_hi_u32 s10, s11, s13
+; GCN-NEXT: s_addc_u32 s3, s16, s14
+; GCN-NEXT: s_addc_u32 s10, s10, 0
+; GCN-NEXT: s_mul_i32 s13, s11, s13
+; GCN-NEXT: s_add_u32 s3, s3, s13
+; GCN-NEXT: s_addc_u32 s10, 0, s10
+; GCN-NEXT: s_add_u32 s3, s12, s3
+; GCN-NEXT: s_addc_u32 s14, s11, s10
; GCN-NEXT: s_ashr_i32 s10, s5, 31
; GCN-NEXT: s_add_u32 s12, s4, s10
; GCN-NEXT: s_mov_b32 s11, s10
@@ -1600,11 +1596,9 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: s_mul_i32 s3, s8, s3
; GCN-NEXT: s_sub_u32 s3, s12, s3
; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
; GCN-NEXT: s_subb_u32 s12, s16, s9
; GCN-NEXT: s_sub_u32 s18, s3, s8
; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s19, s12, 0
; GCN-NEXT: s_cmp_ge_u32 s19, s9
; GCN-NEXT: s_cselect_b32 s20, -1, 0
@@ -1614,12 +1608,10 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: s_cselect_b32 s20, s21, s20
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s12, s12, s9
-; GCN-NEXT: s_sub_u32 s21, s18, s8
-; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
+; GCN-NEXT: s_sub_u32 s16, s18, s8
; GCN-NEXT: s_subb_u32 s12, s12, 0
; GCN-NEXT: s_cmp_lg_u32 s20, 0
-; GCN-NEXT: s_cselect_b32 s16, s21, s18
+; GCN-NEXT: s_cselect_b32 s16, s16, s18
; GCN-NEXT: s_cselect_b32 s12, s12, s19
; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
; GCN-NEXT: s_subb_u32 s5, s13, s5
@@ -1931,11 +1923,9 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; TONGA-NEXT: v_readfirstlane_b32 s14, v0
; TONGA-NEXT: s_sub_u32 s12, s12, s14
; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s3, s3, s7
; TONGA-NEXT: s_sub_u32 s18, s12, s6
; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s19, s3, 0
; TONGA-NEXT: s_cmp_ge_u32 s19, s7
; TONGA-NEXT: s_cselect_b32 s20, -1, 0
@@ -1945,12 +1935,10 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; TONGA-NEXT: s_cselect_b32 s20, s21, s20
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s3, s3, s7
-; TONGA-NEXT: s_sub_u32 s21, s18, s6
-; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
+; TONGA-NEXT: s_sub_u32 s16, s18, s6
; TONGA-NEXT: s_subb_u32 s3, s3, 0
; TONGA-NEXT: s_cmp_lg_u32 s20, 0
-; TONGA-NEXT: s_cselect_b32 s16, s21, s18
+; TONGA-NEXT: s_cselect_b32 s16, s16, s18
; TONGA-NEXT: s_cselect_b32 s3, s3, s19
; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s5, s13, s5
@@ -2730,7 +2718,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7
; GCN-NEXT: s_sub_u32 s9, 0, s6
-; GCN-NEXT: s_subb_u32 s16, 0, s7
+; GCN-NEXT: s_subb_u32 s14, 0, s7
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2739,56 +2727,52 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s17, v1
-; GCN-NEXT: v_readfirstlane_b32 s14, v0
-; GCN-NEXT: s_mul_i32 s15, s9, s17
-; GCN-NEXT: s_mul_hi_u32 s19, s9, s14
-; GCN-NEXT: s_mul_i32 s18, s16, s14
-; GCN-NEXT: s_add_i32 s15, s19, s15
-; GCN-NEXT: s_add_i32 s15, s15, s18
-; GCN-NEXT: s_mul_i32 s20, s9, s14
-; GCN-NEXT: s_mul_i32 s19, s14, s15
-; GCN-NEXT: s_mul_hi_u32 s21, s14, s20
-; GCN-NEXT: s_mul_hi_u32 s18, s14, s15
+; GCN-NEXT: v_readfirstlane_b32 s15, v1
+; GCN-NEXT: v_readfirstlane_b32 s16, v0
+; GCN-NEXT: s_mul_i32 s17, s9, s15
+; GCN-NEXT: s_mul_hi_u32 s19, s9, s16
+; GCN-NEXT: s_mul_i32 s18, s14, s16
+; GCN-NEXT: s_add_i32 s17, s19, s17
+; GCN-NEXT: s_add_i32 s17, s17, s18
+; GCN-NEXT: s_mul_i32 s20, s9, s16
+; GCN-NEXT: s_mul_i32 s19, s16, s17
+; GCN-NEXT: s_mul_hi_u32 s21, s16, s20
+; GCN-NEXT: s_mul_hi_u32 s18, s16, s17
; GCN-NEXT: s_add_u32 s19, s21, s19
; GCN-NEXT: s_addc_u32 s18, 0, s18
-; GCN-NEXT: s_mul_hi_u32 s22, s17, s20
-; GCN-NEXT: s_mul_i32 s20, s17, s20
+; GCN-NEXT: s_mul_hi_u32 s22, s15, s20
+; GCN-NEXT: s_mul_i32 s20, s15, s20
; GCN-NEXT: s_add_u32 s19, s19, s20
-; GCN-NEXT: s_mul_hi_u32 s21, s17, s15
+; GCN-NEXT: s_mul_hi_u32 s21, s15, s17
; GCN-NEXT: s_addc_u32 s18, s18, s22
; GCN-NEXT: s_addc_u32 s19, s21, 0
-; GCN-NEXT: s_mul_i32 s15, s17, s15
-; GCN-NEXT: s_add_u32 s15, s18, s15
+; GCN-NEXT: s_mul_i32 s17, s15, s17
+; GCN-NEXT: s_add_u32 s17, s18, s17
; GCN-NEXT: s_addc_u32 s18, 0, s19
-; GCN-NEXT: s_add_u32 s19, s14, s15
-; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GCN-NEXT: s_addc_u32 s17, s17, s18
-; GCN-NEXT: s_mul_i32 s14, s9, s17
-; GCN-NEXT: s_mul_hi_u32 s15, s9, s19
-; GCN-NEXT: s_add_i32 s14, s15, s14
-; GCN-NEXT: s_mul_i32 s16, s16, s19
-; GCN-NEXT: s_add_i32 s14, s14, s16
-; GCN-NEXT: s_mul_i32 s9, s9, s19
-; GCN-NEXT: s_mul_hi_u32 s16, s17, s9
-; GCN-NEXT: s_mul_i32 s18, s17, s9
-; GCN-NEXT: s_mul_i32 s21, s19, s14
-; GCN-NEXT: s_mul_hi_u32 s9, s19, s9
-; GCN-NEXT: s_mul_hi_u32 s20, s19, s14
+; GCN-NEXT: s_add_u32 s16, s16, s17
+; GCN-NEXT: s_addc_u32 s15, s15, s18
+; GCN-NEXT: s_mul_i32 s17, s9, s15
+; GCN-NEXT: s_mul_hi_u32 s18, s9, s16
+; GCN-NEXT: s_add_i32 s17, s18, s17
+; GCN-NEXT: s_mul_i32 s14, s14, s16
+; GCN-NEXT: s_add_i32 s17, s17, s14
+; GCN-NEXT: s_mul_i32 s9, s9, s16
+; GCN-NEXT: s_mul_hi_u32 s18, s15, s9
+; GCN-NEXT: s_mul_i32 s19, s15, s9
+; GCN-NEXT: s_mul_i32 s21, s16, s17
+; GCN-NEXT: s_mul_hi_u32 s9, s16, s9
+; GCN-NEXT: s_mul_hi_u32 s20, s16, s17
; GCN-NEXT: s_add_u32 s9, s9, s21
; GCN-NEXT: s_addc_u32 s20, 0, s20
-; GCN-NEXT: s_add_u32 s9, s9, s18
-; GCN-NEXT: s_mul_hi_u32 s15, s17, s14
-; GCN-NEXT: s_addc_u32 s9, s20, s16
-; GCN-NEXT: s_addc_u32 s15, s15, 0
-; GCN-NEXT: s_mul_i32 s14, s17, s14
-; GCN-NEXT: s_add_u32 s9, s9, s14
-; GCN-NEXT: s_addc_u32 s16, 0, s15
-; GCN-NEXT: s_add_u32 s9, s19, s9
-; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GCN-NEXT: s_addc_u32 s18, s17, s16
+; GCN-NEXT: s_add_u32 s9, s9, s19
+; GCN-NEXT: s_mul_hi_u32 s14, s15, s17
+; GCN-NEXT: s_addc_u32 s9, s20, s18
+; GCN-NEXT: s_addc_u32 s14, s14, 0
+; GCN-NEXT: s_mul_i32 s17, s15, s17
+; GCN-NEXT: s_add_u32 s9, s9, s17
+; GCN-NEXT: s_addc_u32 s14, 0, s14
+; GCN-NEXT: s_add_u32 s9, s16, s9
+; GCN-NEXT: s_addc_u32 s18, s15, s14
; GCN-NEXT: s_ashr_i32 s14, s11, 31
; GCN-NEXT: s_add_u32 s16, s10, s14
; GCN-NEXT: s_mov_b32 s15, s14
@@ -2817,11 +2801,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s9, s6, s9
; GCN-NEXT: s_sub_u32 s9, s16, s9
; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s16, s20, s7
; GCN-NEXT: s_sub_u32 s22, s9, s6
; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s23, s16, 0
; GCN-NEXT: s_cmp_ge_u32 s23, s7
; GCN-NEXT: s_cselect_b32 s24, -1, 0
@@ -2831,12 +2813,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s24, s25, s24
; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s16, s16, s7
-; GCN-NEXT: s_sub_u32 s25, s22, s6
-; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
+; GCN-NEXT: s_sub_u32 s20, s22, s6
; GCN-NEXT: s_subb_u32 s16, s16, 0
; GCN-NEXT: s_cmp_lg_u32 s24, 0
-; GCN-NEXT: s_cselect_b32 s20, s25, s22
+; GCN-NEXT: s_cselect_b32 s20, s20, s22
; GCN-NEXT: s_cselect_b32 s16, s16, s23
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s11, s17, s11
@@ -2887,7 +2867,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11
; GCN-NEXT: s_sub_u32 s3, 0, s10
-; GCN-NEXT: s_subb_u32 s14, 0, s11
+; GCN-NEXT: s_subb_u32 s12, 0, s11
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2896,56 +2876,52 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s15, v1
-; GCN-NEXT: v_readfirstlane_b32 s12, v0
-; GCN-NEXT: s_mul_i32 s13, s3, s15
-; GCN-NEXT: s_mul_hi_u32 s17, s3, s12
-; GCN-NEXT: s_mul_i32 s16, s14, s12
-; GCN-NEXT: s_add_i32 s13, s17, s13
-; GCN-NEXT: s_add_i32 s13, s13, s16
-; GCN-NEXT: s_mul_i32 s18, s3, s12
-; GCN-NEXT: s_mul_i32 s17, s12, s13
-; GCN-NEXT: s_mul_hi_u32 s19, s12, s18
-; GCN-NEXT: s_mul_hi_u32 s16, s12, s13
+; GCN-NEXT: v_readfirstlane_b32 s13, v1
+; GCN-NEXT: v_readfirstlane_b32 s14, v0
+; GCN-NEXT: s_mul_i32 s15, s3, s13
+; GCN-NEXT: s_mul_hi_u32 s17, s3, s14
+; GCN-NEXT: s_mul_i32 s16, s12, s14
+; GCN-NEXT: s_add_i32 s15, s17, s15
+; GCN-NEXT: s_add_i32 s15, s15, s16
+; GCN-NEXT: s_mul_i32 s18, s3, s14
+; GCN-NEXT: s_mul_i32 s17, s14, s15
+; GCN-NEXT: s_mul_hi_u32 s19, s14, s18
+; GCN-NEXT: s_mul_hi_u32 s16, s14, s15
; GCN-NEXT: s_add_u32 s17, s19, s17
; GCN-NEXT: s_addc_u32 s16, 0, s16
-; GCN-NEXT: s_mul_hi_u32 s20, s15, s18
-; GCN-NEXT: s_mul_i32 s18, s15, s18
+; GCN-NEXT: s_mul_hi_u32 s20, s13, s18
+; GCN-NEXT: s_mul_i32 s18, s13, s18
; GCN-NEXT: s_add_u32 s17, s17, s18
-; GCN-NEXT: s_mul_hi_u32 s19, s15, s13
+; GCN-NEXT: s_mul_hi_u32 s19, s13, s15
; GCN-NEXT: s_addc_u32 s16, s16, s20
; GCN-NEXT: s_addc_u32 s17, s19, 0
-; GCN-NEXT: s_mul_i32 s13, s15, s13
-; GCN-NEXT: s_add_u32 s13, s16, s13
+; GCN-NEXT: s_mul_i32 s15, s13, s15
+; GCN-NEXT: s_add_u32 s15, s16, s15
; GCN-NEXT: s_addc_u32 s16, 0, s17
-; GCN-NEXT: s_add_u32 s17, s12, s13
-; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GCN-NEXT: s_addc_u32 s15, s15, s16
-; GCN-NEXT: s_mul_i32 s12, s3, s15
-; GCN-NEXT: s_mul_hi_u32 s13, s3, s17
-; GCN-NEXT: s_add_i32 s12, s13, s12
-; GCN-NEXT: s_mul_i32 s14, s14, s17
-; GCN-NEXT: s_add_i32 s12, s12, s14
-; GCN-NEXT: s_mul_i32 s3, s3, s17
-; GCN-NEXT: s_mul_hi_u32 s14, s15, s3
-; GCN-NEXT: s_mul_i32 s16, s15, s3
-; GCN-NEXT: s_mul_i32 s19, s17, s12
-; GCN-NEXT: s_mul_hi_u32 s3, s17, s3
-; GCN-NEXT: s_mul_hi_u32 s18, s17, s12
+; GCN-NEXT: s_add_u32 s14, s14, s15
+; GCN-NEXT: s_addc_u32 s13, s13, s16
+; GCN-NEXT: s_mul_i32 s15, s3, s13
+; GCN-NEXT: s_mul_hi_u32 s16, s3, s14
+; GCN-NEXT: s_add_i32 s15, s16, s15
+; GCN-NEXT: s_mul_i32 s12, s12, s14
+; GCN-NEXT: s_add_i32 s15, s15, s12
+; GCN-NEXT: s_mul_i32 s3, s3, s14
+; GCN-NEXT: s_mul_hi_u32 s16, s13, s3
+; GCN-NEXT: s_mul_i32 s17, s13, s3
+; GCN-NEXT: s_mul_i32 s19, s14, s15
+; GCN-NEXT: s_mul_hi_u32 s3, s14, s3
+; GCN-NEXT: s_mul_hi_u32 s18, s14, s15
; GCN-NEXT: s_add_u32 s3, s3, s19
; GCN-NEXT: s_addc_u32 s18, 0, s18
-; GCN-NEXT: s_add_u32 s3, s3, s16
-; GCN-NEXT: s_mul_hi_u32 s13, s15, s12
-; GCN-NEXT: s_addc_u32 s3, s18, s14
-; GCN-NEXT: s_addc_u32 s13, s13, 0
-; GCN-NEXT: s_mul_i32 s12, s15, s12
-; GCN-NEXT: s_add_u32 s3, s3, s12
-; GCN-NEXT: s_addc_u32 s14, 0, s13
-; GCN-NEXT: s_add_u32 s3, s17, s3
-; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GCN-NEXT: s_addc_u32 s16, s15, s14
+; GCN-NEXT: s_add_u32 s3, s3, s17
+; GCN-NEXT: s_mul_hi_u32 s12, s13, s15
+; GCN-NEXT: s_addc_u32 s3, s18, s16
+; GCN-NEXT: s_addc_u32 s12, s12, 0
+; GCN-NEXT: s_mul_i32 s15, s13, s15
+; GCN-NEXT: s_add_u32 s3, s3, s15
+; GCN-NEXT: s_addc_u32 s12, 0, s12
+; GCN-NEXT: s_add_u32 s3, s14, s3
+; GCN-NEXT: s_addc_u32 s16, s13, s12
; GCN-NEXT: s_ashr_i32 s12, s5, 31
; GCN-NEXT: s_add_u32 s14, s4, s12
; GCN-NEXT: s_mov_b32 s13, s12
@@ -2974,11 +2950,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s3, s10, s3
; GCN-NEXT: s_sub_u32 s3, s14, s3
; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s14, s18, s11
; GCN-NEXT: s_sub_u32 s20, s3, s10
; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s21, s14, 0
; GCN-NEXT: s_cmp_ge_u32 s21, s11
; GCN-NEXT: s_cselect_b32 s22, -1, 0
@@ -2988,12 +2962,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s22, s23, s22
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s14, s14, s11
-; GCN-NEXT: s_sub_u32 s23, s20, s10
-; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
+; GCN-NEXT: s_sub_u32 s18, s20, s10
; GCN-NEXT: s_subb_u32 s14, s14, 0
; GCN-NEXT: s_cmp_lg_u32 s22, 0
-; GCN-NEXT: s_cselect_b32 s18, s23, s20
+; GCN-NEXT: s_cselect_b32 s18, s18, s20
; GCN-NEXT: s_cselect_b32 s14, s14, s21
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s5, s15, s5
@@ -3463,11 +3435,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_readfirstlane_b32 s14, v0
; TONGA-NEXT: s_sub_u32 s12, s12, s14
; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s1, s1, s7
; TONGA-NEXT: s_sub_u32 s18, s12, s6
; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s19, s1, 0
; TONGA-NEXT: s_cmp_ge_u32 s19, s7
; TONGA-NEXT: s_cselect_b32 s20, -1, 0
@@ -3477,12 +3447,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: s_cselect_b32 s20, s21, s20
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s1, s1, s7
-; TONGA-NEXT: s_sub_u32 s21, s18, s6
-; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
+; TONGA-NEXT: s_sub_u32 s16, s18, s6
; TONGA-NEXT: s_subb_u32 s1, s1, 0
; TONGA-NEXT: s_cmp_lg_u32 s20, 0
-; TONGA-NEXT: s_cselect_b32 s16, s21, s18
+; TONGA-NEXT: s_cselect_b32 s16, s16, s18
; TONGA-NEXT: s_cselect_b32 s1, s1, s19
; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s3, s13, s3
@@ -4934,7 +4902,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7
; GCN-NEXT: s_sub_u32 s17, 0, s6
-; GCN-NEXT: s_subb_u32 s24, 0, s7
+; GCN-NEXT: s_subb_u32 s22, 0, s7
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -4943,56 +4911,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s25, v1
-; GCN-NEXT: v_readfirstlane_b32 s22, v0
-; GCN-NEXT: s_mul_i32 s23, s17, s25
-; GCN-NEXT: s_mul_hi_u32 s27, s17, s22
-; GCN-NEXT: s_mul_i32 s26, s24, s22
-; GCN-NEXT: s_add_i32 s23, s27, s23
-; GCN-NEXT: s_add_i32 s23, s23, s26
-; GCN-NEXT: s_mul_i32 s28, s17, s22
-; GCN-NEXT: s_mul_i32 s27, s22, s23
-; GCN-NEXT: s_mul_hi_u32 s29, s22, s28
-; GCN-NEXT: s_mul_hi_u32 s26, s22, s23
+; GCN-NEXT: v_readfirstlane_b32 s23, v1
+; GCN-NEXT: v_readfirstlane_b32 s24, v0
+; GCN-NEXT: s_mul_i32 s25, s17, s23
+; GCN-NEXT: s_mul_hi_u32 s27, s17, s24
+; GCN-NEXT: s_mul_i32 s26, s22, s24
+; GCN-NEXT: s_add_i32 s25, s27, s25
+; GCN-NEXT: s_add_i32 s25, s25, s26
+; GCN-NEXT: s_mul_i32 s28, s17, s24
+; GCN-NEXT: s_mul_i32 s27, s24, s25
+; GCN-NEXT: s_mul_hi_u32 s29, s24, s28
+; GCN-NEXT: s_mul_hi_u32 s26, s24, s25
; GCN-NEXT: s_add_u32 s27, s29, s27
; GCN-NEXT: s_addc_u32 s26, 0, s26
-; GCN-NEXT: s_mul_hi_u32 s30, s25, s28
-; GCN-NEXT: s_mul_i32 s28, s25, s28
+; GCN-NEXT: s_mul_hi_u32 s30, s23, s28
+; GCN-NEXT: s_mul_i32 s28, s23, s28
; GCN-NEXT: s_add_u32 s27, s27, s28
-; GCN-NEXT: s_mul_hi_u32 s29, s25, s23
+; GCN-NEXT: s_mul_hi_u32 s29, s23, s25
; GCN-NEXT: s_addc_u32 s26, s26, s30
; GCN-NEXT: s_addc_u32 s27, s29, 0
-; GCN-NEXT: s_mul_i32 s23, s25, s23
-; GCN-NEXT: s_add_u32 s23, s26, s23
+; GCN-NEXT: s_mul_i32 s25, s23, s25
+; GCN-NEXT: s_add_u32 s25, s26, s25
; GCN-NEXT: s_addc_u32 s26, 0, s27
-; GCN-NEXT: s_add_u32 s27, s22, s23
-; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
-; GCN-NEXT: s_addc_u32 s25, s25, s26
-; GCN-NEXT: s_mul_i32 s22, s17, s25
-; GCN-NEXT: s_mul_hi_u32 s23, s17, s27
-; GCN-NEXT: s_add_i32 s22, s23, s22
-; GCN-NEXT: s_mul_i32 s24, s24, s27
-; GCN-NEXT: s_add_i32 s22, s22, s24
-; GCN-NEXT: s_mul_i32 s17, s17, s27
-; GCN-NEXT: s_mul_hi_u32 s24, s25, s17
-; GCN-NEXT: s_mul_i32 s26, s25, s17
-; GCN-NEXT: s_mul_i32 s29, s27, s22
-; GCN-NEXT: s_mul_hi_u32 s17, s27, s17
-; GCN-NEXT: s_mul_hi_u32 s28, s27, s22
+; GCN-NEXT: s_add_u32 s24, s24, s25
+; GCN-NEXT: s_addc_u32 s23, s23, s26
+; GCN-NEXT: s_mul_i32 s25, s17, s23
+; GCN-NEXT: s_mul_hi_u32 s26, s17, s24
+; GCN-NEXT: s_add_i32 s25, s26, s25
+; GCN-NEXT: s_mul_i32 s22, s22, s24
+; GCN-NEXT: s_add_i32 s25, s25, s22
+; GCN-NEXT: s_mul_i32 s17, s17, s24
+; GCN-NEXT: s_mul_hi_u32 s26, s23, s17
+; GCN-NEXT: s_mul_i32 s27, s23, s17
+; GCN-NEXT: s_mul_i32 s29, s24, s25
+; GCN-NEXT: s_mul_hi_u32 s17, s24, s17
+; GCN-NEXT: s_mul_hi_u32 s28, s24, s25
; GCN-NEXT: s_add_u32 s17, s17, s29
; GCN-NEXT: s_addc_u32 s28, 0, s28
-; GCN-NEXT: s_add_u32 s17, s17, s26
-; GCN-NEXT: s_mul_hi_u32 s23, s25, s22
-; GCN-NEXT: s_addc_u32 s17, s28, s24
-; GCN-NEXT: s_addc_u32 s23, s23, 0
-; GCN-NEXT: s_mul_i32 s22, s25, s22
-; GCN-NEXT: s_add_u32 s17, s17, s22
-; GCN-NEXT: s_addc_u32 s24, 0, s23
-; GCN-NEXT: s_add_u32 s17, s27, s17
-; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
-; GCN-NEXT: s_addc_u32 s26, s25, s24
+; GCN-NEXT: s_add_u32 s17, s17, s27
+; GCN-NEXT: s_mul_hi_u32 s22, s23, s25
+; GCN-NEXT: s_addc_u32 s17, s28, s26
+; GCN-NEXT: s_addc_u32 s22, s22, 0
+; GCN-NEXT: s_mul_i32 s25, s23, s25
+; GCN-NEXT: s_add_u32 s17, s17, s25
+; GCN-NEXT: s_addc_u32 s22, 0, s22
+; GCN-NEXT: s_add_u32 s17, s24, s17
+; GCN-NEXT: s_addc_u32 s26, s23, s22
; GCN-NEXT: s_ashr_i32 s22, s19, 31
; GCN-NEXT: s_add_u32 s24, s18, s22
; GCN-NEXT: s_mov_b32 s23, s22
@@ -5021,11 +4985,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s17, s6, s17
; GCN-NEXT: s_sub_u32 s17, s24, s17
; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s24, s28, s7
; GCN-NEXT: s_sub_u32 s30, s17, s6
; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0
; GCN-NEXT: s_subb_u32 s31, s24, 0
; GCN-NEXT: s_cmp_ge_u32 s31, s7
; GCN-NEXT: s_cselect_b32 s33, -1, 0
@@ -5035,12 +4997,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s33, s34, s33
; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0
; GCN-NEXT: s_subb_u32 s24, s24, s7
-; GCN-NEXT: s_sub_u32 s34, s30, s6
-; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0
+; GCN-NEXT: s_sub_u32 s28, s30, s6
; GCN-NEXT: s_subb_u32 s24, s24, 0
; GCN-NEXT: s_cmp_lg_u32 s33, 0
-; GCN-NEXT: s_cselect_b32 s28, s34, s30
+; GCN-NEXT: s_cselect_b32 s28, s28, s30
; GCN-NEXT: s_cselect_b32 s24, s24, s31
; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s19, s25, s19
@@ -5091,7 +5051,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s18
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s19
; GCN-NEXT: s_sub_u32 s13, 0, s18
-; GCN-NEXT: s_subb_u32 s22, 0, s19
+; GCN-NEXT: s_subb_u32 s20, 0, s19
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -5100,56 +5060,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s23, v1
-; GCN-NEXT: v_readfirstlane_b32 s20, v0
-; GCN-NEXT: s_mul_i32 s21, s13, s23
-; GCN-NEXT: s_mul_hi_u32 s25, s13, s20
-; GCN-NEXT: s_mul_i32 s24, s22, s20
-; GCN-NEXT: s_add_i32 s21, s25, s21
-; GCN-NEXT: s_add_i32 s21, s21, s24
-; GCN-NEXT: s_mul_i32 s26, s13, s20
-; GCN-NEXT: s_mul_i32 s25, s20, s21
-; GCN-NEXT: s_mul_hi_u32 s27, s20, s26
-; GCN-NEXT: s_mul_hi_u32 s24, s20, s21
+; GCN-NEXT: v_readfirstlane_b32 s21, v1
+; GCN-NEXT: v_readfirstlane_b32 s22, v0
+; GCN-NEXT: s_mul_i32 s23, s13, s21
+; GCN-NEXT: s_mul_hi_u32 s25, s13, s22
+; GCN-NEXT: s_mul_i32 s24, s20, s22
+; GCN-NEXT: s_add_i32 s23, s25, s23
+; GCN-NEXT: s_add_i32 s23, s23, s24
+; GCN-NEXT: s_mul_i32 s26, s13, s22
+; GCN-NEXT: s_mul_i32 s25, s22, s23
+; GCN-NEXT: s_mul_hi_u32 s27, s22, s26
+; GCN-NEXT: s_mul_hi_u32 s24, s22, s23
; GCN-NEXT: s_add_u32 s25, s27, s25
; GCN-NEXT: s_addc_u32 s24, 0, s24
-; GCN-NEXT: s_mul_hi_u32 s28, s23, s26
-; GCN-NEXT: s_mul_i32 s26, s23, s26
+; GCN-NEXT: s_mul_hi_u32 s28, s21, s26
+; GCN-NEXT: s_mul_i32 s26, s21, s26
; GCN-NEXT: s_add_u32 s25, s25, s26
-; GCN-NEXT: s_mul_hi_u32 s27, s23, s21
+; GCN-NEXT: s_mul_hi_u32 s27, s21, s23
; GCN-NEXT: s_addc_u32 s24, s24, s28
; GCN-NEXT: s_addc_u32 s25, s27, 0
-; GCN-NEXT: s_mul_i32 s21, s23, s21
-; GCN-NEXT: s_add_u32 s21, s24, s21
+; GCN-NEXT: s_mul_i32 s23, s21, s23
+; GCN-NEXT: s_add_u32 s23, s24, s23
; GCN-NEXT: s_addc_u32 s24, 0, s25
-; GCN-NEXT: s_add_u32 s25, s20, s21
-; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
-; GCN-NEXT: s_addc_u32 s23, s23, s24
-; GCN-NEXT: s_mul_i32 s20, s13, s23
-; GCN-NEXT: s_mul_hi_u32 s21, s13, s25
-; GCN-NEXT: s_add_i32 s20, s21, s20
-; GCN-NEXT: s_mul_i32 s22, s22, s25
-; GCN-NEXT: s_add_i32 s20, s20, s22
-; GCN-NEXT: s_mul_i32 s13, s13, s25
-; GCN-NEXT: s_mul_hi_u32 s22, s23, s13
-; GCN-NEXT: s_mul_i32 s24, s23, s13
-; GCN-NEXT: s_mul_i32 s27, s25, s20
-; GCN-NEXT: s_mul_hi_u32 s13, s25, s13
-; GCN-NEXT: s_mul_hi_u32 s26, s25, s20
+; GCN-NEXT: s_add_u32 s22, s22, s23
+; GCN-NEXT: s_addc_u32 s21, s21, s24
+; GCN-NEXT: s_mul_i32 s23, s13, s21
+; GCN-NEXT: s_mul_hi_u32 s24, s13, s22
+; GCN-NEXT: s_add_i32 s23, s24, s23
+; GCN-NEXT: s_mul_i32 s20, s20, s22
+; GCN-NEXT: s_add_i32 s23, s23, s20
+; GCN-NEXT: s_mul_i32 s13, s13, s22
+; GCN-NEXT: s_mul_hi_u32 s24, s21, s13
+; GCN-NEXT: s_mul_i32 s25, s21, s13
+; GCN-NEXT: s_mul_i32 s27, s22, s23
+; GCN-NEXT: s_mul_hi_u32 s13, s22, s13
+; GCN-NEXT: s_mul_hi_u32 s26, s22, s23
; GCN-NEXT: s_add_u32 s13, s13, s27
; GCN-NEXT: s_addc_u32 s26, 0, s26
-; GCN-NEXT: s_add_u32 s13, s13, s24
-; GCN-NEXT: s_mul_hi_u32 s21, s23, s20
-; GCN-NEXT: s_addc_u32 s13, s26, s22
-; GCN-NEXT: s_addc_u32 s21, s21, 0
-; GCN-NEXT: s_mul_i32 s20, s23, s20
-; GCN-NEXT: s_add_u32 s13, s13, s20
-; GCN-NEXT: s_addc_u32 s22, 0, s21
-; GCN-NEXT: s_add_u32 s13, s25, s13
-; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
-; GCN-NEXT: s_addc_u32 s24, s23, s22
+; GCN-NEXT: s_add_u32 s13, s13, s25
+; GCN-NEXT: s_mul_hi_u32 s20, s21, s23
+; GCN-NEXT: s_addc_u32 s13, s26, s24
+; GCN-NEXT: s_addc_u32 s20, s20, 0
+; GCN-NEXT: s_mul_i32 s23, s21, s23
+; GCN-NEXT: s_add_u32 s13, s13, s23
+; GCN-NEXT: s_addc_u32 s20, 0, s20
+; GCN-NEXT: s_add_u32 s13, s22, s13
+; GCN-NEXT: s_addc_u32 s24, s21, s20
; GCN-NEXT: s_ashr_i32 s20, s15, 31
; GCN-NEXT: s_add_u32 s22, s14, s20
; GCN-NEXT: s_mov_b32 s21, s20
@@ -5178,11 +5134,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s13, s18, s13
; GCN-NEXT: s_sub_u32 s13, s22, s13
; GCN-NEXT: s_cselect_b64 s[24:25], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0
; GCN-NEXT: s_subb_u32 s22, s26, s19
; GCN-NEXT: s_sub_u32 s28, s13, s18
; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s29, s22, 0
; GCN-NEXT: s_cmp_ge_u32 s29, s19
; GCN-NEXT: s_cselect_b32 s30, -1, 0
@@ -5192,12 +5146,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s30, s31, s30
; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s22, s22, s19
-; GCN-NEXT: s_sub_u32 s31, s28, s18
-; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
+; GCN-NEXT: s_sub_u32 s26, s28, s18
; GCN-NEXT: s_subb_u32 s22, s22, 0
; GCN-NEXT: s_cmp_lg_u32 s30, 0
-; GCN-NEXT: s_cselect_b32 s26, s31, s28
+; GCN-NEXT: s_cselect_b32 s26, s26, s28
; GCN-NEXT: s_cselect_b32 s22, s22, s29
; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0
; GCN-NEXT: s_subb_u32 s15, s23, s15
@@ -5257,7 +5209,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s14
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s15
; GCN-NEXT: s_sub_u32 s9, 0, s14
-; GCN-NEXT: s_subb_u32 s18, 0, s15
+; GCN-NEXT: s_subb_u32 s16, 0, s15
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -5266,56 +5218,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s19, v1
-; GCN-NEXT: v_readfirstlane_b32 s16, v0
-; GCN-NEXT: s_mul_i32 s17, s9, s19
-; GCN-NEXT: s_mul_hi_u32 s21, s9, s16
-; GCN-NEXT: s_mul_i32 s20, s18, s16
-; GCN-NEXT: s_add_i32 s17, s21, s17
-; GCN-NEXT: s_add_i32 s17, s17, s20
-; GCN-NEXT: s_mul_i32 s22, s9, s16
-; GCN-NEXT: s_mul_i32 s21, s16, s17
-; GCN-NEXT: s_mul_hi_u32 s23, s16, s22
-; GCN-NEXT: s_mul_hi_u32 s20, s16, s17
+; GCN-NEXT: v_readfirstlane_b32 s17, v1
+; GCN-NEXT: v_readfirstlane_b32 s18, v0
+; GCN-NEXT: s_mul_i32 s19, s9, s17
+; GCN-NEXT: s_mul_hi_u32 s21, s9, s18
+; GCN-NEXT: s_mul_i32 s20, s16, s18
+; GCN-NEXT: s_add_i32 s19, s21, s19
+; GCN-NEXT: s_add_i32 s19, s19, s20
+; GCN-NEXT: s_mul_i32 s22, s9, s18
+; GCN-NEXT: s_mul_i32 s21, s18, s19
+; GCN-NEXT: s_mul_hi_u32 s23, s18, s22
+; GCN-NEXT: s_mul_hi_u32 s20, s18, s19
; GCN-NEXT: s_add_u32 s21, s23, s21
; GCN-NEXT: s_addc_u32 s20, 0, s20
-; GCN-NEXT: s_mul_hi_u32 s24, s19, s22
-; GCN-NEXT: s_mul_i32 s22, s19, s22
+; GCN-NEXT: s_mul_hi_u32 s24, s17, s22
+; GCN-NEXT: s_mul_i32 s22, s17, s22
; GCN-NEXT: s_add_u32 s21, s21, s22
-; GCN-NEXT: s_mul_hi_u32 s23, s19, s17
+; GCN-NEXT: s_mul_hi_u32 s23, s17, s19
; GCN-NEXT: s_addc_u32 s20, s20, s24
; GCN-NEXT: s_addc_u32 s21, s23, 0
-; GCN-NEXT: s_mul_i32 s17, s19, s17
-; GCN-NEXT: s_add_u32 s17, s20, s17
+; GCN-NEXT: s_mul_i32 s19, s17, s19
+; GCN-NEXT: s_add_u32 s19, s20, s19
; GCN-NEXT: s_addc_u32 s20, 0, s21
-; GCN-NEXT: s_add_u32 s21, s16, s17
-; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
-; GCN-NEXT: s_addc_u32 s19, s19, s20
-; GCN-NEXT: s_mul_i32 s16, s9, s19
-; GCN-NEXT: s_mul_hi_u32 s17, s9, s21
-; GCN-NEXT: s_add_i32 s16, s17, s16
-; GCN-NEXT: s_mul_i32 s18, s18, s21
-; GCN-NEXT: s_add_i32 s16, s16, s18
-; GCN-NEXT: s_mul_i32 s9, s9, s21
-; GCN-NEXT: s_mul_hi_u32 s18, s19, s9
-; GCN-NEXT: s_mul_i32 s20, s19, s9
-; GCN-NEXT: s_mul_i32 s23, s21, s16
-; GCN-NEXT: s_mul_hi_u32 s9, s21, s9
-; GCN-NEXT: s_mul_hi_u32 s22, s21, s16
+; GCN-NEXT: s_add_u32 s18, s18, s19
+; GCN-NEXT: s_addc_u32 s17, s17, s20
+; GCN-NEXT: s_mul_i32 s19, s9, s17
+; GCN-NEXT: s_mul_hi_u32 s20, s9, s18
+; GCN-NEXT: s_add_i32 s19, s20, s19
+; GCN-NEXT: s_mul_i32 s16, s16, s18
+; GCN-NEXT: s_add_i32 s19, s19, s16
+; GCN-NEXT: s_mul_i32 s9, s9, s18
+; GCN-NEXT: s_mul_hi_u32 s20, s17, s9
+; GCN-NEXT: s_mul_i32 s21, s17, s9
+; GCN-NEXT: s_mul_i32 s23, s18, s19
+; GCN-NEXT: s_mul_hi_u32 s9, s18, s9
+; GCN-NEXT: s_mul_hi_u32 s22, s18, s19
; GCN-NEXT: s_add_u32 s9, s9, s23
; GCN-NEXT: s_addc_u32 s22, 0, s22
-; GCN-NEXT: s_add_u32 s9, s9, s20
-; GCN-NEXT: s_mul_hi_u32 s17, s19, s16
-; GCN-NEXT: s_addc_u32 s9, s22, s18
-; GCN-NEXT: s_addc_u32 s17, s17, 0
-; GCN-NEXT: s_mul_i32 s16, s19, s16
-; GCN-NEXT: s_add_u32 s9, s9, s16
-; GCN-NEXT: s_addc_u32 s18, 0, s17
-; GCN-NEXT: s_add_u32 s9, s21, s9
-; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
-; GCN-NEXT: s_addc_u32 s20, s19, s18
+; GCN-NEXT: s_add_u32 s9, s9, s21
+; GCN-NEXT: s_mul_hi_u32 s16, s17, s19
+; GCN-NEXT: s_addc_u32 s9, s22, s20
+; GCN-NEXT: s_addc_u32 s16, s16, 0
+; GCN-NEXT: s_mul_i32 s19, s17, s19
+; GCN-NEXT: s_add_u32 s9, s9, s19
+; GCN-NEXT: s_addc_u32 s16, 0, s16
+; GCN-NEXT: s_add_u32 s9, s18, s9
+; GCN-NEXT: s_addc_u32 s20, s17, s16
; GCN-NEXT: s_ashr_i32 s16, s11, 31
; GCN-NEXT: s_add_u32 s18, s10, s16
; GCN-NEXT: s_mov_b32 s17, s16
@@ -5344,11 +5292,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s9, s14, s9
; GCN-NEXT: s_sub_u32 s9, s18, s9
; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s18, s22, s15
; GCN-NEXT: s_sub_u32 s24, s9, s14
; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
; GCN-NEXT: s_subb_u32 s25, s18, 0
; GCN-NEXT: s_cmp_ge_u32 s25, s15
; GCN-NEXT: s_cselect_b32 s26, -1, 0
@@ -5358,12 +5304,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s26, s27, s26
; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
; GCN-NEXT: s_subb_u32 s18, s18, s15
-; GCN-NEXT: s_sub_u32 s27, s24, s14
-; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
+; GCN-NEXT: s_sub_u32 s22, s24, s14
; GCN-NEXT: s_subb_u32 s18, s18, 0
; GCN-NEXT: s_cmp_lg_u32 s26, 0
-; GCN-NEXT: s_cselect_b32 s22, s27, s24
+; GCN-NEXT: s_cselect_b32 s22, s22, s24
; GCN-NEXT: s_cselect_b32 s18, s18, s25
; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s11, s19, s11
@@ -5420,7 +5364,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11
; GCN-NEXT: s_sub_u32 s3, 0, s10
-; GCN-NEXT: s_subb_u32 s14, 0, s11
+; GCN-NEXT: s_subb_u32 s12, 0, s11
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -5429,56 +5373,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s15, v1
-; GCN-NEXT: v_readfirstlane_b32 s12, v0
-; GCN-NEXT: s_mul_i32 s13, s3, s15
-; GCN-NEXT: s_mul_hi_u32 s17, s3, s12
-; GCN-NEXT: s_mul_i32 s16, s14, s12
-; GCN-NEXT: s_add_i32 s13, s17, s13
-; GCN-NEXT: s_add_i32 s13, s13, s16
-; GCN-NEXT: s_mul_i32 s18, s3, s12
-; GCN-NEXT: s_mul_i32 s17, s12, s13
-; GCN-NEXT: s_mul_hi_u32 s19, s12, s18
-; GCN-NEXT: s_mul_hi_u32 s16, s12, s13
+; GCN-NEXT: v_readfirstlane_b32 s13, v1
+; GCN-NEXT: v_readfirstlane_b32 s14, v0
+; GCN-NEXT: s_mul_i32 s15, s3, s13
+; GCN-NEXT: s_mul_hi_u32 s17, s3, s14
+; GCN-NEXT: s_mul_i32 s16, s12, s14
+; GCN-NEXT: s_add_i32 s15, s17, s15
+; GCN-NEXT: s_add_i32 s15, s15, s16
+; GCN-NEXT: s_mul_i32 s18, s3, s14
+; GCN-NEXT: s_mul_i32 s17, s14, s15
+; GCN-NEXT: s_mul_hi_u32 s19, s14, s18
+; GCN-NEXT: s_mul_hi_u32 s16, s14, s15
; GCN-NEXT: s_add_u32 s17, s19, s17
; GCN-NEXT: s_addc_u32 s16, 0, s16
-; GCN-NEXT: s_mul_hi_u32 s20, s15, s18
-; GCN-NEXT: s_mul_i32 s18, s15, s18
+; GCN-NEXT: s_mul_hi_u32 s20, s13, s18
+; GCN-NEXT: s_mul_i32 s18, s13, s18
; GCN-NEXT: s_add_u32 s17, s17, s18
-; GCN-NEXT: s_mul_hi_u32 s19, s15, s13
+; GCN-NEXT: s_mul_hi_u32 s19, s13, s15
; GCN-NEXT: s_addc_u32 s16, s16, s20
; GCN-NEXT: s_addc_u32 s17, s19, 0
-; GCN-NEXT: s_mul_i32 s13, s15, s13
-; GCN-NEXT: s_add_u32 s13, s16, s13
+; GCN-NEXT: s_mul_i32 s15, s13, s15
+; GCN-NEXT: s_add_u32 s15, s16, s15
; GCN-NEXT: s_addc_u32 s16, 0, s17
-; GCN-NEXT: s_add_u32 s17, s12, s13
-; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GCN-NEXT: s_addc_u32 s15, s15, s16
-; GCN-NEXT: s_mul_i32 s12, s3, s15
-; GCN-NEXT: s_mul_hi_u32 s13, s3, s17
-; GCN-NEXT: s_add_i32 s12, s13, s12
-; GCN-NEXT: s_mul_i32 s14, s14, s17
-; GCN-NEXT: s_add_i32 s12, s12, s14
-; GCN-NEXT: s_mul_i32 s3, s3, s17
-; GCN-NEXT: s_mul_hi_u32 s14, s15, s3
-; GCN-NEXT: s_mul_i32 s16, s15, s3
-; GCN-NEXT: s_mul_i32 s19, s17, s12
-; GCN-NEXT: s_mul_hi_u32 s3, s17, s3
-; GCN-NEXT: s_mul_hi_u32 s18, s17, s12
+; GCN-NEXT: s_add_u32 s14, s14, s15
+; GCN-NEXT: s_addc_u32 s13, s13, s16
+; GCN-NEXT: s_mul_i32 s15, s3, s13
+; GCN-NEXT: s_mul_hi_u32 s16, s3, s14
+; GCN-NEXT: s_add_i32 s15, s16, s15
+; GCN-NEXT: s_mul_i32 s12, s12, s14
+; GCN-NEXT: s_add_i32 s15, s15, s12
+; GCN-NEXT: s_mul_i32 s3, s3, s14
+; GCN-NEXT: s_mul_hi_u32 s16, s13, s3
+; GCN-NEXT: s_mul_i32 s17, s13, s3
+; GCN-NEXT: s_mul_i32 s19, s14, s15
+; GCN-NEXT: s_mul_hi_u32 s3, s14, s3
+; GCN-NEXT: s_mul_hi_u32 s18, s14, s15
; GCN-NEXT: s_add_u32 s3, s3, s19
; GCN-NEXT: s_addc_u32 s18, 0, s18
-; GCN-NEXT: s_add_u32 s3, s3, s16
-; GCN-NEXT: s_mul_hi_u32 s13, s15, s12
-; GCN-NEXT: s_addc_u32 s3, s18, s14
-; GCN-NEXT: s_addc_u32 s13, s13, 0
-; GCN-NEXT: s_mul_i32 s12, s15, s12
-; GCN-NEXT: s_add_u32 s3, s3, s12
-; GCN-NEXT: s_addc_u32 s14, 0, s13
-; GCN-NEXT: s_add_u32 s3, s17, s3
-; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GCN-NEXT: s_addc_u32 s16, s15, s14
+; GCN-NEXT: s_add_u32 s3, s3, s17
+; GCN-NEXT: s_mul_hi_u32 s12, s13, s15
+; GCN-NEXT: s_addc_u32 s3, s18, s16
+; GCN-NEXT: s_addc_u32 s12, s12, 0
+; GCN-NEXT: s_mul_i32 s15, s13, s15
+; GCN-NEXT: s_add_u32 s3, s3, s15
+; GCN-NEXT: s_addc_u32 s12, 0, s12
+; GCN-NEXT: s_add_u32 s3, s14, s3
+; GCN-NEXT: s_addc_u32 s16, s13, s12
; GCN-NEXT: s_ashr_i32 s12, s5, 31
; GCN-NEXT: s_add_u32 s14, s4, s12
; GCN-NEXT: s_mov_b32 s13, s12
@@ -5507,11 +5447,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s3, s10, s3
; GCN-NEXT: s_sub_u32 s3, s14, s3
; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s14, s18, s11
; GCN-NEXT: s_sub_u32 s20, s3, s10
; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s21, s14, 0
; GCN-NEXT: s_cmp_ge_u32 s21, s11
; GCN-NEXT: s_cselect_b32 s22, -1, 0
@@ -5521,12 +5459,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s22, s23, s22
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s14, s14, s11
-; GCN-NEXT: s_sub_u32 s23, s20, s10
-; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
+; GCN-NEXT: s_sub_u32 s18, s20, s10
; GCN-NEXT: s_subb_u32 s14, s14, 0
; GCN-NEXT: s_cmp_lg_u32 s22, 0
-; GCN-NEXT: s_cselect_b32 s18, s23, s20
+; GCN-NEXT: s_cselect_b32 s18, s18, s20
; GCN-NEXT: s_cselect_b32 s14, s14, s21
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s5, s15, s5
@@ -6299,11 +6235,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_readfirstlane_b32 s14, v8
; TONGA-NEXT: s_sub_u32 s12, s12, s14
; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s1, s1, s7
; TONGA-NEXT: s_sub_u32 s18, s12, s6
; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s19, s1, 0
; TONGA-NEXT: s_cmp_ge_u32 s19, s7
; TONGA-NEXT: s_cselect_b32 s20, -1, 0
@@ -6313,12 +6247,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: s_cselect_b32 s20, s21, s20
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s1, s1, s7
-; TONGA-NEXT: s_sub_u32 s21, s18, s6
-; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
+; TONGA-NEXT: s_sub_u32 s16, s18, s6
; TONGA-NEXT: s_subb_u32 s1, s1, 0
; TONGA-NEXT: s_cmp_lg_u32 s20, 0
-; TONGA-NEXT: s_cselect_b32 s16, s21, s18
+; TONGA-NEXT: s_cselect_b32 s16, s16, s18
; TONGA-NEXT: s_cselect_b32 s1, s1, s19
; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s3, s13, s3
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 33b0a5d..ea9bb04 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -51,10 +51,9 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_addc_u32 s13, 0, s14
; GCN-NEXT: s_add_u32 s14, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
+; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s12, s12, s13
; GCN-NEXT: s_mul_i32 s0, s10, s12
; GCN-NEXT: v_readfirstlane_b32 s1, v0
@@ -85,7 +84,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_add_u32 s11, s14, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s1, s12, s10
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mul_hi_u32 v1, s6, v0
@@ -115,46 +113,43 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: v_readfirstlane_b32 s10, v0
; GCN-NEXT: s_add_i32 s5, s10, s5
; GCN-NEXT: s_mul_i32 s10, s9, s4
-; GCN-NEXT: s_add_i32 s10, s5, s10
-; GCN-NEXT: s_sub_i32 s11, s7, s10
+; GCN-NEXT: s_add_i32 s12, s5, s10
+; GCN-NEXT: s_sub_i32 s10, s7, s12
; GCN-NEXT: s_mul_i32 s4, s8, s4
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s12, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s12, 0
-; GCN-NEXT: s_subb_u32 s11, s11, s9
-; GCN-NEXT: s_sub_u32 s13, s6, s8
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT: s_or_b32 s11, s4, s5
+; GCN-NEXT: s_subb_u32 s13, s10, s9
+; GCN-NEXT: s_sub_u32 s14, s6, s8
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_or_b32 s15, s10, s11
+; GCN-NEXT: s_subb_u32 s15, s13, 0
+; GCN-NEXT: s_cmp_ge_u32 s15, s9
+; GCN-NEXT: s_cselect_b32 s16, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s14, s8
+; GCN-NEXT: s_cselect_b32 s17, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s15, s9
+; GCN-NEXT: s_cselect_b32 s16, s17, s16
+; GCN-NEXT: s_or_b32 s10, s10, s11
+; GCN-NEXT: s_subb_u32 s13, s13, s9
+; GCN-NEXT: s_sub_u32 s17, s14, s8
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_or_b32 s10, s10, s11
+; GCN-NEXT: s_subb_u32 s10, s13, 0
+; GCN-NEXT: s_cmp_lg_u32 s16, 0
+; GCN-NEXT: s_cselect_b32 s11, s17, s14
+; GCN-NEXT: s_cselect_b32 s10, s10, s15
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_subb_u32 s14, s11, 0
-; GCN-NEXT: s_cmp_ge_u32 s14, s9
+; GCN-NEXT: s_subb_u32 s4, s7, s12
+; GCN-NEXT: s_cmp_ge_u32 s4, s9
; GCN-NEXT: s_cselect_b32 s5, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s13, s8
-; GCN-NEXT: s_cselect_b32 s15, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s14, s9
-; GCN-NEXT: s_cselect_b32 s15, s15, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_subb_u32 s11, s11, s9
-; GCN-NEXT: s_sub_u32 s16, s13, s8
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_subb_u32 s4, s11, 0
-; GCN-NEXT: s_cmp_lg_u32 s15, 0
-; GCN-NEXT: s_cselect_b32 s5, s16, s13
-; GCN-NEXT: s_cselect_b32 s4, s4, s14
-; GCN-NEXT: s_cmp_lg_u32 s12, 0
-; GCN-NEXT: s_subb_u32 s7, s7, s10
-; GCN-NEXT: s_cmp_ge_u32 s7, s9
-; GCN-NEXT: s_cselect_b32 s10, -1, 0
; GCN-NEXT: s_cmp_ge_u32 s6, s8
-; GCN-NEXT: s_cselect_b32 s8, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s7, s9
-; GCN-NEXT: s_cselect_b32 s8, s8, s10
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
-; GCN-NEXT: s_cselect_b32 s4, s4, s7
-; GCN-NEXT: s_cselect_b32 s5, s5, s6
+; GCN-NEXT: s_cselect_b32 s7, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s4, s9
+; GCN-NEXT: s_cselect_b32 s5, s7, s5
+; GCN-NEXT: s_cmp_lg_u32 s5, 0
+; GCN-NEXT: s_cselect_b32 s4, s10, s4
+; GCN-NEXT: s_cselect_b32 s5, s11, s6
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -187,7 +182,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_or_b32 s8, s8, s9
-; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
@@ -221,7 +215,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_add_u32 s16, s16, 1
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_or_b32 s18, s18, s19
-; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0
; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
@@ -1016,10 +1009,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: s_addc_u32 s13, 0, s14
; GCN-NEXT: s_add_u32 s14, s8, s9
; GCN-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: s_addc_u32 s12, s12, s13
; GCN-NEXT: s_mul_i32 s8, s10, s12
; GCN-NEXT: v_readfirstlane_b32 s9, v0
@@ -1050,7 +1042,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: s_add_u32 s11, s14, s8
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: s_addc_u32 s10, s12, s10
; GCN-NEXT: s_ashr_i32 s8, s7, 31
; GCN-NEXT: s_add_u32 s6, s6, s8
@@ -1083,46 +1074,43 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: v_readfirstlane_b32 s12, v0
; GCN-NEXT: s_add_i32 s11, s12, s11
; GCN-NEXT: s_mul_i32 s12, s5, s10
-; GCN-NEXT: s_add_i32 s12, s11, s12
-; GCN-NEXT: s_sub_i32 s13, s7, s12
+; GCN-NEXT: s_add_i32 s14, s11, s12
+; GCN-NEXT: s_sub_i32 s12, s7, s14
; GCN-NEXT: s_mul_i32 s10, s4, s10
; GCN-NEXT: s_sub_u32 s6, s6, s10
; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s14, s10, s11
-; GCN-NEXT: s_cmp_lg_u32 s14, 0
-; GCN-NEXT: s_subb_u32 s13, s13, s5
-; GCN-NEXT: s_sub_u32 s15, s6, s4
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_cmp_lg_u32 s10, 0
-; GCN-NEXT: s_subb_u32 s16, s13, 0
-; GCN-NEXT: s_cmp_ge_u32 s16, s5
-; GCN-NEXT: s_cselect_b32 s11, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s15, s4
-; GCN-NEXT: s_cselect_b32 s17, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s16, s5
-; GCN-NEXT: s_cselect_b32 s17, s17, s11
-; GCN-NEXT: s_cmp_lg_u32 s10, 0
-; GCN-NEXT: s_subb_u32 s13, s13, s5
-; GCN-NEXT: s_sub_u32 s18, s15, s4
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_or_b32 s13, s10, s11
+; GCN-NEXT: s_subb_u32 s15, s12, s5
+; GCN-NEXT: s_sub_u32 s16, s6, s4
+; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GCN-NEXT: s_or_b32 s17, s12, s13
+; GCN-NEXT: s_subb_u32 s17, s15, 0
+; GCN-NEXT: s_cmp_ge_u32 s17, s5
+; GCN-NEXT: s_cselect_b32 s18, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s16, s4
+; GCN-NEXT: s_cselect_b32 s19, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s17, s5
+; GCN-NEXT: s_cselect_b32 s18, s19, s18
+; GCN-NEXT: s_or_b32 s12, s12, s13
+; GCN-NEXT: s_subb_u32 s15, s15, s5
+; GCN-NEXT: s_sub_u32 s19, s16, s4
+; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GCN-NEXT: s_or_b32 s12, s12, s13
+; GCN-NEXT: s_subb_u32 s12, s15, 0
+; GCN-NEXT: s_cmp_lg_u32 s18, 0
+; GCN-NEXT: s_cselect_b32 s13, s19, s16
+; GCN-NEXT: s_cselect_b32 s12, s12, s17
; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_cmp_lg_u32 s10, 0
-; GCN-NEXT: s_subb_u32 s10, s13, 0
-; GCN-NEXT: s_cmp_lg_u32 s17, 0
-; GCN-NEXT: s_cselect_b32 s11, s18, s15
-; GCN-NEXT: s_cselect_b32 s10, s10, s16
-; GCN-NEXT: s_cmp_lg_u32 s14, 0
-; GCN-NEXT: s_subb_u32 s7, s7, s12
+; GCN-NEXT: s_subb_u32 s7, s7, s14
; GCN-NEXT: s_cmp_ge_u32 s7, s5
-; GCN-NEXT: s_cselect_b32 s12, -1, 0
+; GCN-NEXT: s_cselect_b32 s10, -1, 0
; GCN-NEXT: s_cmp_ge_u32 s6, s4
; GCN-NEXT: s_cselect_b32 s4, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s7, s5
-; GCN-NEXT: s_cselect_b32 s4, s4, s12
+; GCN-NEXT: s_cselect_b32 s4, s4, s10
; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_cselect_b32 s5, s10, s7
-; GCN-NEXT: s_cselect_b32 s4, s11, s6
+; GCN-NEXT: s_cselect_b32 s5, s12, s7
+; GCN-NEXT: s_cselect_b32 s4, s13, s6
; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
; GCN-NEXT: s_sub_u32 s4, s4, s8
; GCN-NEXT: s_subb_u32 s5, s5, s8
@@ -1170,7 +1158,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-IR-NEXT: s_add_u32 s16, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_or_b32 s10, s10, s11
-; GCN-IR-NEXT: s_cmp_lg_u32 s10, 0
; GCN-IR-NEXT: s_addc_u32 s10, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_sub_i32 s14, 63, s14
@@ -1204,7 +1191,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-IR-NEXT: s_add_u32 s18, s18, 1
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
; GCN-IR-NEXT: s_or_b32 s20, s20, s21
-; GCN-IR-NEXT: s_cmp_lg_u32 s20, 0
; GCN-IR-NEXT: s_addc_u32 s19, s19, 0
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3]
@@ -1369,10 +1355,9 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_addc_u32 s10, 0, s11
; GCN-NEXT: s_add_u32 s11, s6, s7
; GCN-NEXT: v_mov_b32_e32 v0, s11
-; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
+; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-NEXT: s_or_b32 s6, s6, s7
-; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_addc_u32 s9, s9, s10
; GCN-NEXT: s_mul_i32 s6, s2, s9
; GCN-NEXT: v_readfirstlane_b32 s7, v0
@@ -1403,7 +1388,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_add_u32 s2, s11, s2
; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-NEXT: s_or_b32 s6, s6, s7
-; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_addc_u32 s6, s9, s8
; GCN-NEXT: v_mul_hi_u32 v1, s2, 24
; GCN-NEXT: v_mul_hi_u32 v0, s6, 24
@@ -1418,45 +1402,42 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_mul_i32 s7, s5, s6
; GCN-NEXT: s_mul_i32 s6, s4, s6
; GCN-NEXT: v_readfirstlane_b32 s8, v0
-; GCN-NEXT: s_add_i32 s8, s8, s7
-; GCN-NEXT: s_sub_i32 s9, 0, s8
-; GCN-NEXT: s_sub_u32 s10, 24, s6
-; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GCN-NEXT: s_or_b32 s11, s6, s7
-; GCN-NEXT: s_cmp_lg_u32 s11, 0
-; GCN-NEXT: s_subb_u32 s9, s9, s5
-; GCN-NEXT: s_sub_u32 s12, s10, s4
+; GCN-NEXT: s_add_i32 s10, s8, s7
+; GCN-NEXT: s_sub_i32 s8, 0, s10
+; GCN-NEXT: s_sub_u32 s11, 24, s6
; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GCN-NEXT: s_or_b32 s9, s6, s7
+; GCN-NEXT: s_subb_u32 s12, s8, s5
+; GCN-NEXT: s_sub_u32 s13, s11, s4
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: s_or_b32 s14, s8, s9
+; GCN-NEXT: s_subb_u32 s14, s12, 0
+; GCN-NEXT: s_cmp_ge_u32 s14, s5
+; GCN-NEXT: s_cselect_b32 s15, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s13, s4
+; GCN-NEXT: s_cselect_b32 s16, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s14, s5
+; GCN-NEXT: s_cselect_b32 s15, s16, s15
+; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_subb_u32 s12, s12, s5
+; GCN-NEXT: s_sub_u32 s16, s13, s4
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_subb_u32 s8, s12, 0
+; GCN-NEXT: s_cmp_lg_u32 s15, 0
+; GCN-NEXT: s_cselect_b32 s9, s16, s13
+; GCN-NEXT: s_cselect_b32 s8, s8, s14
; GCN-NEXT: s_or_b32 s6, s6, s7
-; GCN-NEXT: s_cmp_lg_u32 s6, 0
-; GCN-NEXT: s_subb_u32 s13, s9, 0
-; GCN-NEXT: s_cmp_ge_u32 s13, s5
+; GCN-NEXT: s_subb_u32 s6, 0, s10
+; GCN-NEXT: s_cmp_ge_u32 s6, s5
; GCN-NEXT: s_cselect_b32 s7, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s12, s4
-; GCN-NEXT: s_cselect_b32 s14, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s13, s5
-; GCN-NEXT: s_cselect_b32 s14, s14, s7
-; GCN-NEXT: s_cmp_lg_u32 s6, 0
-; GCN-NEXT: s_subb_u32 s9, s9, s5
-; GCN-NEXT: s_sub_u32 s15, s12, s4
-; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GCN-NEXT: s_or_b32 s6, s6, s7
-; GCN-NEXT: s_cmp_lg_u32 s6, 0
-; GCN-NEXT: s_subb_u32 s6, s9, 0
-; GCN-NEXT: s_cmp_lg_u32 s14, 0
-; GCN-NEXT: s_cselect_b32 s7, s15, s12
-; GCN-NEXT: s_cselect_b32 s6, s6, s13
-; GCN-NEXT: s_cmp_lg_u32 s11, 0
-; GCN-NEXT: s_subb_u32 s8, 0, s8
-; GCN-NEXT: s_cmp_ge_u32 s8, s5
-; GCN-NEXT: s_cselect_b32 s9, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s10, s4
+; GCN-NEXT: s_cmp_ge_u32 s11, s4
; GCN-NEXT: s_cselect_b32 s4, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s8, s5
-; GCN-NEXT: s_cselect_b32 s4, s4, s9
+; GCN-NEXT: s_cmp_eq_u32 s6, s5
+; GCN-NEXT: s_cselect_b32 s4, s4, s7
; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_cselect_b32 s4, s6, s8
-; GCN-NEXT: s_cselect_b32 s5, s7, s10
+; GCN-NEXT: s_cselect_b32 s4, s8, s6
+; GCN-NEXT: s_cselect_b32 s5, s9, s11
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -1489,7 +1470,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s8, s2, 1
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_or_b32 s9, s10, s11
-; GCN-IR-NEXT: s_cmp_lg_u32 s9, 0
; GCN-IR-NEXT: s_addc_u32 s3, s3, 0
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_sub_i32 s2, 63, s2
@@ -1522,7 +1502,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
-; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/stackguard.ll b/llvm/test/CodeGen/AMDGPU/stackguard.ll
new file mode 100644
index 0000000..393686f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/stackguard.ll
@@ -0,0 +1,14 @@
+; RUN: not llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s
+; RUN: not llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s
+
+; Negative test: llc is expected to fail on @llvm.stackguard with the error matched below.
+; FIXME: To actually support stackguard, fix the intrinsic to return a pointer in any address space.
+
+; CHECK: error: unable to lower stackguard
+define i1 @test_stackguard(ptr %p1) {
+ %p2 = call ptr @llvm.stackguard()
+ %res = icmp ne ptr %p2, %p1
+ ret i1 %res
+}
+
+declare ptr @llvm.stackguard()
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index bb5918b2..bdd22f25 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -18,7 +18,6 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: s_or_b32 s0, s0, s1
-; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_addc_u32 s3, s3, s9
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -35,10 +34,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_add_u32 s2, s2, s4
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
; VI-NEXT: s_addc_u32 s3, s3, s5
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s3
@@ -53,14 +50,12 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s6, s2, s6
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s4, s3, s7
+; GFX9-NEXT: s_add_u32 s4, s2, s6
+; GFX9-NEXT: s_addc_u32 s5, s3, s7
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
@@ -73,8 +68,6 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s2, s2, s6
-; GFX10-NEXT: s_cselect_b32 s4, -1, 0
-; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: s_addc_u32 s3, s3, s7
; GFX10-NEXT: s_cselect_b32 s4, -1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
@@ -91,14 +84,12 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s2, s2, s4
-; GFX11-NEXT: s_cselect_b32 s4, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-NEXT: s_addc_u32 s3, s3, s5
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
@@ -444,7 +435,6 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_add_u32 s4, s4, s6
; SI-NEXT: s_cselect_b64 s[12:13], -1, 0
; SI-NEXT: s_or_b32 s6, s12, s13
-; SI-NEXT: s_cmp_lg_u32 s6, 0
; SI-NEXT: s_addc_u32 s5, s5, s7
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
@@ -465,16 +455,14 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: s_add_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_add_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_addc_u32 s1, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
-; VI-NEXT: s_addc_u32 s0, s5, s7
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: v_mov_b32_e32 v5, s0
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -486,12 +474,10 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s2, s12, s14
-; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_addc_u32 s0, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_add_u32 s0, s12, s14
+; GFX9-NEXT: s_addc_u32 s1, s13, s15
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -504,10 +490,8 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s0, s12, s14
-; GFX10-NEXT: s_cselect_b32 s1, -1, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: s_cmp_lg_u32 s1, 0
; GFX10-NEXT: s_addc_u32 s1, s13, s15
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: s_cselect_b32 s0, -1, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -520,10 +504,8 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s4, s4, s6
-; GFX11-NEXT: s_cselect_b32 s6, -1, 0
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: s_cmp_lg_u32 s6, 0
; GFX11-NEXT: s_addc_u32 s5, s5, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index 41199b0..fd461ac 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -148,7 +148,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_or_b32 s8, s8, s9
-; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
@@ -182,7 +181,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_add_u32 s10, s10, 1
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
-; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[2:3], s[4:5]
@@ -831,10 +829,9 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_addc_u32 s10, 0, s11
; GCN-NEXT: s_add_u32 s11, s4, s5
; GCN-NEXT: v_mov_b32_e32 v0, s11
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
+; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_addc_u32 s9, s9, s10
; GCN-NEXT: s_mul_i32 s4, s6, s9
; GCN-NEXT: v_readfirstlane_b32 s5, v0
@@ -865,7 +862,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_add_u32 s8, s11, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_addc_u32 s4, s9, s6
; GCN-NEXT: v_mul_hi_u32 v1, s8, 24
; GCN-NEXT: v_mul_hi_u32 v0, s4, 24
@@ -874,52 +870,50 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_readfirstlane_b32 s8, v1
; GCN-NEXT: v_readfirstlane_b32 s5, v0
; GCN-NEXT: s_add_u32 s4, s8, s4
-; GCN-NEXT: s_addc_u32 s8, 0, s5
-; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: s_addc_u32 s10, 0, s5
+; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: s_mul_i32 s0, s3, s8
+; GCN-NEXT: s_mul_i32 s0, s3, s10
; GCN-NEXT: v_readfirstlane_b32 s1, v0
-; GCN-NEXT: s_add_i32 s9, s1, s0
-; GCN-NEXT: s_sub_i32 s10, 0, s9
-; GCN-NEXT: s_mul_i32 s0, s2, s8
-; GCN-NEXT: s_sub_u32 s11, 24, s0
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s12, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s12, 0
-; GCN-NEXT: s_subb_u32 s10, s10, s3
-; GCN-NEXT: s_sub_u32 s13, s11, s2
+; GCN-NEXT: s_add_i32 s11, s1, s0
+; GCN-NEXT: s_sub_i32 s8, 0, s11
+; GCN-NEXT: s_mul_i32 s0, s2, s10
+; GCN-NEXT: s_sub_u32 s12, 24, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT: s_or_b32 s9, s0, s1
+; GCN-NEXT: s_subb_u32 s13, s8, s3
+; GCN-NEXT: s_sub_u32 s14, s12, s2
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_subb_u32 s8, s13, 0
+; GCN-NEXT: s_cmp_ge_u32 s8, s3
+; GCN-NEXT: s_cselect_b32 s9, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s14, s2
+; GCN-NEXT: s_cselect_b32 s13, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s8, s3
+; GCN-NEXT: s_cselect_b32 s8, s13, s9
+; GCN-NEXT: s_add_u32 s9, s10, 1
+; GCN-NEXT: s_addc_u32 s13, 0, 0
+; GCN-NEXT: s_add_u32 s14, s10, 2
+; GCN-NEXT: s_addc_u32 s15, 0, 0
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
+; GCN-NEXT: s_cselect_b32 s8, s14, s9
+; GCN-NEXT: s_cselect_b32 s9, s15, s13
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_subb_u32 s0, s10, 0
+; GCN-NEXT: s_subb_u32 s0, 0, s11
; GCN-NEXT: s_cmp_ge_u32 s0, s3
; GCN-NEXT: s_cselect_b32 s1, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s13, s2
-; GCN-NEXT: s_cselect_b32 s10, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s12, s2
+; GCN-NEXT: s_cselect_b32 s2, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s0, s3
-; GCN-NEXT: s_cselect_b32 s0, s10, s1
-; GCN-NEXT: s_add_u32 s1, s8, 1
-; GCN-NEXT: s_addc_u32 s10, 0, 0
-; GCN-NEXT: s_add_u32 s13, s8, 2
-; GCN-NEXT: s_addc_u32 s14, 0, 0
+; GCN-NEXT: s_cselect_b32 s0, s2, s1
; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_cselect_b32 s0, s13, s1
-; GCN-NEXT: s_cselect_b32 s1, s14, s10
-; GCN-NEXT: s_cmp_lg_u32 s12, 0
-; GCN-NEXT: s_subb_u32 s9, 0, s9
-; GCN-NEXT: s_cmp_ge_u32 s9, s3
-; GCN-NEXT: s_cselect_b32 s10, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s11, s2
-; GCN-NEXT: s_cselect_b32 s2, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s9, s3
-; GCN-NEXT: s_cselect_b32 s2, s2, s10
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
-; GCN-NEXT: s_cselect_b32 s1, s1, 0
-; GCN-NEXT: s_cselect_b32 s0, s0, s8
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: s_cselect_b32 s0, s9, 0
+; GCN-NEXT: s_cselect_b32 s1, s8, s10
+; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
@@ -945,7 +939,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s10, s8, 1
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_or_b32 s6, s6, s7
-; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -978,7 +971,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
-; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
@@ -1317,7 +1309,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s11, s8, 1
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_or_b32 s6, s6, s7
-; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -1347,7 +1338,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s10, s10, 1
; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0
; GCN-IR-NEXT: s_or_b32 s12, s12, s13
-; GCN-IR-NEXT: s_cmp_lg_u32 s12, 0
; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
index 9bcba6c..2d7ce10 100644
--- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
define amdgpu_kernel void @uitofp_i16_to_f16(
; SI-LABEL: uitofp_i16_to_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index cdcc914..137dc1f 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -51,10 +51,9 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: s_addc_u32 s13, 0, s14
; GCN-NEXT: s_add_u32 s14, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
+; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s12, s12, s13
; GCN-NEXT: s_mul_i32 s0, s10, s12
; GCN-NEXT: v_readfirstlane_b32 s1, v0
@@ -85,7 +84,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: s_add_u32 s11, s14, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s1, s12, s10
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mul_hi_u32 v1, s6, v0
@@ -115,46 +113,43 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: v_readfirstlane_b32 s10, v0
; GCN-NEXT: s_add_i32 s5, s10, s5
; GCN-NEXT: s_mul_i32 s10, s9, s4
-; GCN-NEXT: s_add_i32 s10, s5, s10
-; GCN-NEXT: s_sub_i32 s11, s7, s10
+; GCN-NEXT: s_add_i32 s12, s5, s10
+; GCN-NEXT: s_sub_i32 s10, s7, s12
; GCN-NEXT: s_mul_i32 s4, s8, s4
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s12, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s12, 0
-; GCN-NEXT: s_subb_u32 s11, s11, s9
-; GCN-NEXT: s_sub_u32 s13, s6, s8
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT: s_or_b32 s11, s4, s5
+; GCN-NEXT: s_subb_u32 s13, s10, s9
+; GCN-NEXT: s_sub_u32 s14, s6, s8
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_or_b32 s15, s10, s11
+; GCN-NEXT: s_subb_u32 s15, s13, 0
+; GCN-NEXT: s_cmp_ge_u32 s15, s9
+; GCN-NEXT: s_cselect_b32 s16, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s14, s8
+; GCN-NEXT: s_cselect_b32 s17, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s15, s9
+; GCN-NEXT: s_cselect_b32 s16, s17, s16
+; GCN-NEXT: s_or_b32 s10, s10, s11
+; GCN-NEXT: s_subb_u32 s13, s13, s9
+; GCN-NEXT: s_sub_u32 s17, s14, s8
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_or_b32 s10, s10, s11
+; GCN-NEXT: s_subb_u32 s10, s13, 0
+; GCN-NEXT: s_cmp_lg_u32 s16, 0
+; GCN-NEXT: s_cselect_b32 s11, s17, s14
+; GCN-NEXT: s_cselect_b32 s10, s10, s15
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_subb_u32 s14, s11, 0
-; GCN-NEXT: s_cmp_ge_u32 s14, s9
+; GCN-NEXT: s_subb_u32 s4, s7, s12
+; GCN-NEXT: s_cmp_ge_u32 s4, s9
; GCN-NEXT: s_cselect_b32 s5, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s13, s8
-; GCN-NEXT: s_cselect_b32 s15, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s14, s9
-; GCN-NEXT: s_cselect_b32 s15, s15, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_subb_u32 s11, s11, s9
-; GCN-NEXT: s_sub_u32 s16, s13, s8
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_subb_u32 s4, s11, 0
-; GCN-NEXT: s_cmp_lg_u32 s15, 0
-; GCN-NEXT: s_cselect_b32 s5, s16, s13
-; GCN-NEXT: s_cselect_b32 s4, s4, s14
-; GCN-NEXT: s_cmp_lg_u32 s12, 0
-; GCN-NEXT: s_subb_u32 s7, s7, s10
-; GCN-NEXT: s_cmp_ge_u32 s7, s9
-; GCN-NEXT: s_cselect_b32 s10, -1, 0
; GCN-NEXT: s_cmp_ge_u32 s6, s8
-; GCN-NEXT: s_cselect_b32 s8, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s7, s9
-; GCN-NEXT: s_cselect_b32 s8, s8, s10
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
-; GCN-NEXT: s_cselect_b32 s4, s4, s7
-; GCN-NEXT: s_cselect_b32 s5, s5, s6
+; GCN-NEXT: s_cselect_b32 s7, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s4, s9
+; GCN-NEXT: s_cselect_b32 s5, s7, s5
+; GCN-NEXT: s_cmp_lg_u32 s5, 0
+; GCN-NEXT: s_cselect_b32 s4, s10, s4
+; GCN-NEXT: s_cselect_b32 s5, s11, s6
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -187,7 +182,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_or_b32 s8, s8, s9
-; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
@@ -221,7 +215,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_add_u32 s16, s16, 1
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_or_b32 s18, s18, s19
-; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0
; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
@@ -853,10 +846,9 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_addc_u32 s10, 0, s11
; GCN-NEXT: s_add_u32 s11, s4, s5
; GCN-NEXT: v_mov_b32_e32 v0, s11
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
+; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_addc_u32 s9, s9, s10
; GCN-NEXT: s_mul_i32 s4, s6, s9
; GCN-NEXT: v_readfirstlane_b32 s5, v0
@@ -887,7 +879,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_add_u32 s8, s11, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_addc_u32 s4, s9, s6
; GCN-NEXT: v_mul_hi_u32 v1, s8, 24
; GCN-NEXT: v_mul_hi_u32 v0, s4, 24
@@ -903,46 +894,43 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_mul_i32 s0, s3, s8
; GCN-NEXT: v_readfirstlane_b32 s1, v0
-; GCN-NEXT: s_add_i32 s9, s1, s0
-; GCN-NEXT: s_sub_i32 s10, 0, s9
+; GCN-NEXT: s_add_i32 s10, s1, s0
+; GCN-NEXT: s_sub_i32 s9, 0, s10
; GCN-NEXT: s_mul_i32 s0, s2, s8
-; GCN-NEXT: s_sub_u32 s8, 24, s0
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s11, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s11, 0
-; GCN-NEXT: s_subb_u32 s10, s10, s3
-; GCN-NEXT: s_sub_u32 s12, s8, s2
+; GCN-NEXT: s_sub_u32 s11, 24, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT: s_or_b32 s8, s0, s1
+; GCN-NEXT: s_subb_u32 s12, s9, s3
+; GCN-NEXT: s_sub_u32 s13, s11, s2
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: s_or_b32 s14, s8, s9
+; GCN-NEXT: s_subb_u32 s14, s12, 0
+; GCN-NEXT: s_cmp_ge_u32 s14, s3
+; GCN-NEXT: s_cselect_b32 s15, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s13, s2
+; GCN-NEXT: s_cselect_b32 s16, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s14, s3
+; GCN-NEXT: s_cselect_b32 s15, s16, s15
+; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_subb_u32 s12, s12, s3
+; GCN-NEXT: s_sub_u32 s16, s13, s2
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_subb_u32 s8, s12, 0
+; GCN-NEXT: s_cmp_lg_u32 s15, 0
+; GCN-NEXT: s_cselect_b32 s9, s16, s13
+; GCN-NEXT: s_cselect_b32 s8, s8, s14
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_subb_u32 s13, s10, 0
-; GCN-NEXT: s_cmp_ge_u32 s13, s3
+; GCN-NEXT: s_subb_u32 s0, 0, s10
+; GCN-NEXT: s_cmp_ge_u32 s0, s3
; GCN-NEXT: s_cselect_b32 s1, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s12, s2
-; GCN-NEXT: s_cselect_b32 s14, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s13, s3
-; GCN-NEXT: s_cselect_b32 s14, s14, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_subb_u32 s10, s10, s3
-; GCN-NEXT: s_sub_u32 s15, s12, s2
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_subb_u32 s0, s10, 0
-; GCN-NEXT: s_cmp_lg_u32 s14, 0
-; GCN-NEXT: s_cselect_b32 s1, s15, s12
-; GCN-NEXT: s_cselect_b32 s0, s0, s13
-; GCN-NEXT: s_cmp_lg_u32 s11, 0
-; GCN-NEXT: s_subb_u32 s9, 0, s9
-; GCN-NEXT: s_cmp_ge_u32 s9, s3
-; GCN-NEXT: s_cselect_b32 s10, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s8, s2
+; GCN-NEXT: s_cmp_ge_u32 s11, s2
; GCN-NEXT: s_cselect_b32 s2, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s9, s3
-; GCN-NEXT: s_cselect_b32 s2, s2, s10
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
-; GCN-NEXT: s_cselect_b32 s0, s0, s9
-; GCN-NEXT: s_cselect_b32 s1, s1, s8
+; GCN-NEXT: s_cmp_eq_u32 s0, s3
+; GCN-NEXT: s_cselect_b32 s1, s2, s1
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_cselect_b32 s0, s8, s0
+; GCN-NEXT: s_cselect_b32 s1, s9, s11
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -970,7 +958,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s10, s8, 1
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_or_b32 s6, s6, s7
-; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -1003,7 +990,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
-; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
@@ -1093,7 +1079,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s11, s8, 1
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_or_b32 s6, s6, s7
-; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -1123,7 +1108,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s12, s12, 1
; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0
; GCN-IR-NEXT: s_or_b32 s14, s14, s15
-; GCN-IR-NEXT: s_cmp_lg_u32 s14, 0
; GCN-IR-NEXT: s_addc_u32 s13, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index d67a7b1..e8db647 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -18,7 +18,6 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: s_or_b32 s0, s0, s1
-; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_subb_u32 s3, s3, s9
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -35,10 +34,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_sub_u32 s2, s2, s4
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
; VI-NEXT: s_subb_u32 s3, s3, s5
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s3
@@ -53,14 +50,12 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s6, s2, s6
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_subb_u32 s4, s3, s7
+; GFX9-NEXT: s_sub_u32 s4, s2, s6
+; GFX9-NEXT: s_subb_u32 s5, s3, s7
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
@@ -73,8 +68,6 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sub_u32 s2, s2, s6
-; GFX10-NEXT: s_cselect_b32 s4, -1, 0
-; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: s_subb_u32 s3, s3, s7
; GFX10-NEXT: s_cselect_b32 s4, -1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
@@ -91,14 +84,12 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_u32 s2, s2, s4
-; GFX11-NEXT: s_cselect_b32 s4, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-NEXT: s_subb_u32 s3, s3, s5
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
@@ -443,7 +434,6 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_sub_u32 s4, s4, s6
; SI-NEXT: s_cselect_b64 s[12:13], -1, 0
; SI-NEXT: s_or_b32 s6, s12, s13
-; SI-NEXT: s_cmp_lg_u32 s6, 0
; SI-NEXT: s_subb_u32 s5, s5, s7
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
@@ -464,16 +454,14 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: s_sub_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_sub_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_subb_u32 s1, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
-; VI-NEXT: s_subb_u32 s0, s5, s7
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: v_mov_b32_e32 v5, s0
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -485,12 +473,10 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s2, s12, s14
-; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_subb_u32 s0, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_sub_u32 s0, s12, s14
+; GFX9-NEXT: s_subb_u32 s1, s13, s15
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -503,10 +489,8 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sub_u32 s0, s12, s14
-; GFX10-NEXT: s_cselect_b32 s1, -1, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: s_cmp_lg_u32 s1, 0
; GFX10-NEXT: s_subb_u32 s1, s13, s15
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: s_cselect_b32 s0, -1, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -519,10 +503,8 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_u32 s4, s4, s6
-; GFX11-NEXT: s_cselect_b32 s6, -1, 0
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: s_cmp_lg_u32 s6, 0
; GFX11-NEXT: s_subb_u32 s5, s5, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
index f3cb5a7..30f5277 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
@@ -26,17 +26,17 @@ define amdgpu_kernel void @barrier_vmcnt_global(ptr addrspace(1) %arg) {
; GFX9-LABEL: barrier_vmcnt_global:
; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v1, s[0:1]
-; GFX9-NEXT: v_add_u32_e32 v1, 1, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: global_load_dword v3, v1, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_barrier
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: global_store_dword v[0:1], v3, off
; GFX9-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -369,10 +369,9 @@ define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(ptr %arg) {
; GFX8-NEXT: flat_load_dword v3, v[2:3]
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0
; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
-; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_barrier
; GFX8-NEXT: flat_store_dword v[0:1], v3
; GFX8-NEXT: s_endpgm
@@ -393,10 +392,9 @@ define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(ptr %arg) {
; GFX9-NEXT: flat_load_dword v3, v[2:3]
; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_barrier
; GFX9-NEXT: flat_store_dword v[0:1], v3
; GFX9-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 75db387..28c6b40 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -774,44 +774,40 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: s_add_u32 s11, s12, s11
; GFX1032-NEXT: s_addc_u32 s12, 0, s13
; GFX1032-NEXT: s_add_u32 s8, s8, s11
-; GFX1032-NEXT: s_cselect_b32 s11, -1, 0
-; GFX1032-NEXT: s_mul_hi_u32 s13, s9, s8
-; GFX1032-NEXT: s_cmp_lg_u32 s11, 0
-; GFX1032-NEXT: s_mul_i32 s11, s9, s8
; GFX1032-NEXT: s_addc_u32 s5, s5, s12
-; GFX1032-NEXT: s_mul_i32 s10, s10, s8
+; GFX1032-NEXT: s_mul_hi_u32 s11, s9, s8
+; GFX1032-NEXT: s_mul_i32 s12, s9, s8
; GFX1032-NEXT: s_mul_i32 s9, s9, s5
-; GFX1032-NEXT: s_mul_hi_u32 s12, s8, s11
-; GFX1032-NEXT: s_add_i32 s9, s13, s9
-; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s11
+; GFX1032-NEXT: s_mul_i32 s10, s10, s8
+; GFX1032-NEXT: s_add_i32 s9, s11, s9
+; GFX1032-NEXT: s_mul_i32 s11, s5, s12
; GFX1032-NEXT: s_add_i32 s9, s9, s10
-; GFX1032-NEXT: s_mul_i32 s10, s5, s11
+; GFX1032-NEXT: s_mul_hi_u32 s10, s8, s12
; GFX1032-NEXT: s_mul_i32 s15, s8, s9
; GFX1032-NEXT: s_mul_hi_u32 s14, s8, s9
-; GFX1032-NEXT: s_add_u32 s12, s12, s15
+; GFX1032-NEXT: s_add_u32 s10, s10, s15
+; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s12
; GFX1032-NEXT: s_addc_u32 s14, 0, s14
-; GFX1032-NEXT: s_mul_hi_u32 s11, s5, s9
-; GFX1032-NEXT: s_add_u32 s10, s12, s10
+; GFX1032-NEXT: s_mul_hi_u32 s12, s5, s9
+; GFX1032-NEXT: s_add_u32 s10, s10, s11
; GFX1032-NEXT: s_mul_i32 s9, s5, s9
; GFX1032-NEXT: s_addc_u32 s10, s14, s13
-; GFX1032-NEXT: s_addc_u32 s11, s11, 0
+; GFX1032-NEXT: s_addc_u32 s11, s12, 0
; GFX1032-NEXT: s_add_u32 s9, s10, s9
; GFX1032-NEXT: s_addc_u32 s10, 0, s11
; GFX1032-NEXT: s_add_u32 s8, s8, s9
-; GFX1032-NEXT: s_cselect_b32 s9, -1, 0
-; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s8
-; GFX1032-NEXT: s_cmp_lg_u32 s9, 0
-; GFX1032-NEXT: s_mul_hi_u32 s9, s3, s8
; GFX1032-NEXT: s_addc_u32 s5, s5, s10
-; GFX1032-NEXT: s_mul_i32 s8, s3, s8
+; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s8
; GFX1032-NEXT: s_mul_i32 s12, s2, s5
-; GFX1032-NEXT: s_mul_hi_u32 s10, s2, s5
-; GFX1032-NEXT: s_add_u32 s11, s11, s12
-; GFX1032-NEXT: s_addc_u32 s10, 0, s10
+; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s5
+; GFX1032-NEXT: s_mul_hi_u32 s10, s3, s8
+; GFX1032-NEXT: s_mul_i32 s8, s3, s8
+; GFX1032-NEXT: s_add_u32 s9, s9, s12
+; GFX1032-NEXT: s_addc_u32 s11, 0, s11
; GFX1032-NEXT: s_mul_hi_u32 s13, s3, s5
-; GFX1032-NEXT: s_add_u32 s8, s11, s8
+; GFX1032-NEXT: s_add_u32 s8, s9, s8
; GFX1032-NEXT: s_mul_i32 s5, s3, s5
-; GFX1032-NEXT: s_addc_u32 s8, s10, s9
+; GFX1032-NEXT: s_addc_u32 s8, s11, s10
; GFX1032-NEXT: s_addc_u32 s9, s13, 0
; GFX1032-NEXT: s_add_u32 s5, s8, s5
; GFX1032-NEXT: s_addc_u32 s8, 0, s9
@@ -824,11 +820,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: s_sub_i32 s11, s3, s9
; GFX1032-NEXT: s_sub_u32 s10, s2, s10
; GFX1032-NEXT: s_cselect_b32 s12, -1, 0
-; GFX1032-NEXT: s_cmp_lg_u32 s12, 0
; GFX1032-NEXT: s_subb_u32 s11, s11, s1
; GFX1032-NEXT: s_sub_u32 s13, s10, s0
-; GFX1032-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1032-NEXT: s_cmp_lg_u32 s14, 0
; GFX1032-NEXT: s_subb_u32 s11, s11, 0
; GFX1032-NEXT: s_cmp_ge_u32 s11, s1
; GFX1032-NEXT: s_cselect_b32 s14, -1, 0
@@ -901,8 +894,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0
; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s1
-; GFX1064-NEXT: s_sub_u32 s9, 0, s0
-; GFX1064-NEXT: s_subb_u32 s10, 0, s1
+; GFX1064-NEXT: s_sub_u32 s8, 0, s0
+; GFX1064-NEXT: s_subb_u32 s9, 0, s1
; GFX1064-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX1064-NEXT: v_rcp_f32_e32 v0, v0
; GFX1064-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -911,109 +904,102 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1064-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GFX1064-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX1064-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s8, v1
-; GFX1064-NEXT: v_readfirstlane_b32 s4, v0
-; GFX1064-NEXT: s_mul_i32 s5, s9, s8
-; GFX1064-NEXT: s_mul_hi_u32 s12, s9, s4
-; GFX1064-NEXT: s_mul_i32 s11, s10, s4
-; GFX1064-NEXT: s_add_i32 s5, s12, s5
-; GFX1064-NEXT: s_mul_i32 s13, s9, s4
-; GFX1064-NEXT: s_add_i32 s5, s5, s11
-; GFX1064-NEXT: s_mul_hi_u32 s12, s4, s13
-; GFX1064-NEXT: s_mul_i32 s15, s4, s5
-; GFX1064-NEXT: s_mul_hi_u32 s14, s8, s13
-; GFX1064-NEXT: s_mul_i32 s11, s8, s13
-; GFX1064-NEXT: s_mul_hi_u32 s13, s4, s5
+; GFX1064-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1064-NEXT: v_readfirstlane_b32 s5, v0
+; GFX1064-NEXT: s_mul_i32 s10, s8, s4
+; GFX1064-NEXT: s_mul_hi_u32 s12, s8, s5
+; GFX1064-NEXT: s_mul_i32 s11, s9, s5
+; GFX1064-NEXT: s_add_i32 s10, s12, s10
+; GFX1064-NEXT: s_mul_i32 s13, s8, s5
+; GFX1064-NEXT: s_add_i32 s10, s10, s11
+; GFX1064-NEXT: s_mul_hi_u32 s12, s5, s13
+; GFX1064-NEXT: s_mul_i32 s15, s5, s10
+; GFX1064-NEXT: s_mul_hi_u32 s14, s4, s13
+; GFX1064-NEXT: s_mul_i32 s11, s4, s13
+; GFX1064-NEXT: s_mul_hi_u32 s13, s5, s10
; GFX1064-NEXT: s_add_u32 s12, s12, s15
; GFX1064-NEXT: s_addc_u32 s13, 0, s13
-; GFX1064-NEXT: s_mul_hi_u32 s16, s8, s5
+; GFX1064-NEXT: s_mul_hi_u32 s16, s4, s10
; GFX1064-NEXT: s_add_u32 s11, s12, s11
-; GFX1064-NEXT: s_mul_i32 s5, s8, s5
+; GFX1064-NEXT: s_mul_i32 s10, s4, s10
; GFX1064-NEXT: s_addc_u32 s11, s13, s14
; GFX1064-NEXT: s_addc_u32 s12, s16, 0
-; GFX1064-NEXT: s_add_u32 s5, s11, s5
+; GFX1064-NEXT: s_add_u32 s10, s11, s10
; GFX1064-NEXT: s_addc_u32 s11, 0, s12
-; GFX1064-NEXT: s_add_u32 s12, s4, s5
-; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX1064-NEXT: s_mul_hi_u32 s13, s9, s12
-; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX1064-NEXT: s_mul_i32 s4, s9, s12
-; GFX1064-NEXT: s_addc_u32 s8, s8, s11
-; GFX1064-NEXT: s_mul_i32 s10, s10, s12
-; GFX1064-NEXT: s_mul_i32 s9, s9, s8
-; GFX1064-NEXT: s_mul_hi_u32 s5, s12, s4
-; GFX1064-NEXT: s_add_i32 s9, s13, s9
-; GFX1064-NEXT: s_mul_hi_u32 s11, s8, s4
-; GFX1064-NEXT: s_add_i32 s9, s9, s10
-; GFX1064-NEXT: s_mul_i32 s4, s8, s4
-; GFX1064-NEXT: s_mul_i32 s14, s12, s9
-; GFX1064-NEXT: s_mul_hi_u32 s13, s12, s9
-; GFX1064-NEXT: s_add_u32 s5, s5, s14
+; GFX1064-NEXT: s_add_u32 s5, s5, s10
+; GFX1064-NEXT: s_addc_u32 s4, s4, s11
+; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s5
+; GFX1064-NEXT: s_mul_i32 s11, s8, s5
+; GFX1064-NEXT: s_mul_i32 s8, s8, s4
+; GFX1064-NEXT: s_mul_i32 s9, s9, s5
+; GFX1064-NEXT: s_add_i32 s8, s10, s8
+; GFX1064-NEXT: s_mul_i32 s10, s4, s11
+; GFX1064-NEXT: s_add_i32 s8, s8, s9
+; GFX1064-NEXT: s_mul_hi_u32 s9, s5, s11
+; GFX1064-NEXT: s_mul_i32 s14, s5, s8
+; GFX1064-NEXT: s_mul_hi_u32 s13, s5, s8
+; GFX1064-NEXT: s_add_u32 s9, s9, s14
+; GFX1064-NEXT: s_mul_hi_u32 s12, s4, s11
; GFX1064-NEXT: s_addc_u32 s13, 0, s13
-; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s9
-; GFX1064-NEXT: s_add_u32 s4, s5, s4
-; GFX1064-NEXT: s_mul_i32 s9, s8, s9
-; GFX1064-NEXT: s_addc_u32 s4, s13, s11
-; GFX1064-NEXT: s_addc_u32 s5, s10, 0
-; GFX1064-NEXT: s_add_u32 s4, s4, s9
-; GFX1064-NEXT: s_addc_u32 s9, 0, s5
-; GFX1064-NEXT: s_add_u32 s10, s12, s4
-; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX1064-NEXT: s_mul_hi_u32 s11, s2, s10
-; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX1064-NEXT: s_mul_hi_u32 s4, s3, s10
-; GFX1064-NEXT: s_addc_u32 s5, s8, s9
-; GFX1064-NEXT: s_mul_i32 s8, s3, s10
-; GFX1064-NEXT: s_mul_i32 s10, s2, s5
-; GFX1064-NEXT: s_mul_hi_u32 s9, s2, s5
-; GFX1064-NEXT: s_add_u32 s10, s11, s10
-; GFX1064-NEXT: s_addc_u32 s9, 0, s9
-; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s5
-; GFX1064-NEXT: s_add_u32 s8, s10, s8
+; GFX1064-NEXT: s_mul_hi_u32 s11, s4, s8
+; GFX1064-NEXT: s_add_u32 s9, s9, s10
+; GFX1064-NEXT: s_mul_i32 s8, s4, s8
+; GFX1064-NEXT: s_addc_u32 s9, s13, s12
+; GFX1064-NEXT: s_addc_u32 s10, s11, 0
+; GFX1064-NEXT: s_add_u32 s8, s9, s8
+; GFX1064-NEXT: s_addc_u32 s9, 0, s10
+; GFX1064-NEXT: s_add_u32 s5, s5, s8
+; GFX1064-NEXT: s_addc_u32 s4, s4, s9
+; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s5
+; GFX1064-NEXT: s_mul_i32 s11, s2, s4
+; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s4
+; GFX1064-NEXT: s_mul_hi_u32 s9, s3, s5
; GFX1064-NEXT: s_mul_i32 s5, s3, s5
-; GFX1064-NEXT: s_addc_u32 s4, s9, s4
+; GFX1064-NEXT: s_add_u32 s8, s8, s11
+; GFX1064-NEXT: s_addc_u32 s10, 0, s10
+; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s4
+; GFX1064-NEXT: s_add_u32 s5, s8, s5
+; GFX1064-NEXT: s_mul_i32 s4, s3, s4
+; GFX1064-NEXT: s_addc_u32 s5, s10, s9
; GFX1064-NEXT: s_addc_u32 s8, s12, 0
-; GFX1064-NEXT: s_add_u32 s10, s4, s5
+; GFX1064-NEXT: s_add_u32 s10, s5, s4
; GFX1064-NEXT: s_addc_u32 s11, 0, s8
; GFX1064-NEXT: s_mul_hi_u32 s4, s0, s10
; GFX1064-NEXT: s_mul_i32 s5, s0, s11
; GFX1064-NEXT: s_mul_i32 s8, s1, s10
; GFX1064-NEXT: s_add_i32 s4, s4, s5
-; GFX1064-NEXT: s_add_i32 s12, s4, s8
+; GFX1064-NEXT: s_add_i32 s8, s4, s8
; GFX1064-NEXT: s_mul_i32 s4, s0, s10
-; GFX1064-NEXT: s_sub_i32 s8, s3, s12
-; GFX1064-NEXT: s_sub_u32 s13, s2, s4
+; GFX1064-NEXT: s_sub_i32 s9, s3, s8
+; GFX1064-NEXT: s_sub_u32 s12, s2, s4
; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX1064-NEXT: s_subb_u32 s14, s8, s1
-; GFX1064-NEXT: s_sub_u32 s15, s13, s0
-; GFX1064-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX1064-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GFX1064-NEXT: s_subb_u32 s8, s14, 0
-; GFX1064-NEXT: s_cmp_ge_u32 s8, s1
-; GFX1064-NEXT: s_cselect_b32 s9, -1, 0
-; GFX1064-NEXT: s_cmp_ge_u32 s15, s0
+; GFX1064-NEXT: s_subb_u32 s9, s9, s1
+; GFX1064-NEXT: s_sub_u32 s13, s12, s0
+; GFX1064-NEXT: s_subb_u32 s9, s9, 0
+; GFX1064-NEXT: s_cmp_ge_u32 s9, s1
; GFX1064-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1064-NEXT: s_cmp_eq_u32 s8, s1
-; GFX1064-NEXT: s_cselect_b32 s8, s14, s9
-; GFX1064-NEXT: s_add_u32 s9, s10, 1
+; GFX1064-NEXT: s_cmp_ge_u32 s13, s0
+; GFX1064-NEXT: s_cselect_b32 s13, -1, 0
+; GFX1064-NEXT: s_cmp_eq_u32 s9, s1
+; GFX1064-NEXT: s_cselect_b32 s9, s13, s14
+; GFX1064-NEXT: s_add_u32 s13, s10, 1
; GFX1064-NEXT: s_addc_u32 s14, s11, 0
; GFX1064-NEXT: s_add_u32 s15, s10, 2
; GFX1064-NEXT: s_addc_u32 s16, s11, 0
-; GFX1064-NEXT: s_cmp_lg_u32 s8, 0
-; GFX1064-NEXT: s_cselect_b32 s15, s15, s9
+; GFX1064-NEXT: s_cmp_lg_u32 s9, 0
+; GFX1064-NEXT: s_cselect_b32 s13, s15, s13
; GFX1064-NEXT: s_cselect_b32 s14, s16, s14
; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX1064-NEXT: s_subb_u32 s3, s3, s12
+; GFX1064-NEXT: s_subb_u32 s3, s3, s8
; GFX1064-NEXT: s_cmp_ge_u32 s3, s1
; GFX1064-NEXT: s_cselect_b32 s4, -1, 0
-; GFX1064-NEXT: s_cmp_ge_u32 s13, s0
+; GFX1064-NEXT: s_cmp_ge_u32 s12, s0
; GFX1064-NEXT: s_cselect_b32 s5, -1, 0
; GFX1064-NEXT: s_cmp_eq_u32 s3, s1
; GFX1064-NEXT: s_cselect_b32 s1, s5, s4
; GFX1064-NEXT: s_cmp_lg_u32 s1, 0
; GFX1064-NEXT: s_cselect_b32 s5, s14, s11
-; GFX1064-NEXT: s_cselect_b32 s4, s15, s10
+; GFX1064-NEXT: s_cselect_b32 s4, s13, s10
; GFX1064-NEXT: s_cbranch_execnz .LBB15_3
; GFX1064-NEXT: .LBB15_2:
; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
index 64d055b..4445383 100644
--- a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
+++ b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
@@ -271,7 +271,6 @@ define i1 @workgroup_nonzero() {
; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX8-NEXT: s_or_b32 s4, s12, s13
; DAGISEL-GFX8-NEXT: s_or_b32 s4, s4, s14
-; DAGISEL-GFX8-NEXT: s_cmp_lg_u32 s4, 0
; DAGISEL-GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0
; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -281,7 +280,6 @@ define i1 @workgroup_nonzero() {
; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX942-NEXT: s_or_b32 s0, s12, s13
; DAGISEL-GFX942-NEXT: s_or_b32 s0, s0, s14
-; DAGISEL-GFX942-NEXT: s_cmp_lg_u32 s0, 0
; DAGISEL-GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0
; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; DAGISEL-GFX942-NEXT: s_setpc_b64 s[30:31]
@@ -299,8 +297,6 @@ define i1 @workgroup_nonzero() {
; DAGISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
; DAGISEL-GFX12-NEXT: s_or_b32 s0, s0, s1
-; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
-; DAGISEL-GFX12-NEXT: s_cmp_lg_u32 s0, 0
; DAGISEL-GFX12-NEXT: s_cselect_b32 s0, -1, 0
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
; DAGISEL-GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
@@ -311,7 +307,6 @@ define i1 @workgroup_nonzero() {
; GISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX8-NEXT: s_or_b32 s4, s12, s13
; GISEL-GFX8-NEXT: s_or_b32 s4, s4, s14
-; GISEL-GFX8-NEXT: s_cmp_lg_u32 s4, 0
; GISEL-GFX8-NEXT: s_cselect_b32 s4, 1, 0
; GISEL-GFX8-NEXT: v_mov_b32_e32 v0, s4
; GISEL-GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -321,7 +316,6 @@ define i1 @workgroup_nonzero() {
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX942-NEXT: s_or_b32 s0, s12, s13
; GISEL-GFX942-NEXT: s_or_b32 s0, s0, s14
-; GISEL-GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GISEL-GFX942-NEXT: s_cselect_b32 s0, 1, 0
; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s0
; GISEL-GFX942-NEXT: s_setpc_b64 s[30:31]
@@ -339,8 +333,6 @@ define i1 @workgroup_nonzero() {
; GISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0
; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
; GISEL-GFX12-NEXT: s_or_b32 s0, s0, s1
-; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
-; GISEL-GFX12-NEXT: s_cmp_lg_u32 s0, 0
; GISEL-GFX12-NEXT: s_cselect_b32 s0, 1, 0
; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
; GISEL-GFX12-NEXT: v_mov_b32_e32 v0, s0