aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll')
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll818
1 files changed, 178 insertions, 640 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
index f4040f3..bd4ee03 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
@@ -256,17 +256,15 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB10_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB10_4
; GFX1250-SDAG-NEXT: .LBB10_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB10_5
@@ -276,16 +274,15 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB10_2
; GFX1250-SDAG-NEXT: .LBB10_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB10_5
@@ -307,11 +304,9 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB10_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB10_4
; GFX1250-GISEL-NEXT: .LBB10_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB10_5
@@ -321,17 +316,14 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB10_2
; GFX1250-GISEL-NEXT: .LBB10_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[4:5], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB10_5
@@ -350,22 +342,19 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB11_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB11_4
; GFX1250-SDAG-NEXT: .LBB11_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB11_5
@@ -375,7 +364,6 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB11_2
; GFX1250-SDAG-NEXT: .LBB11_4: ; %atomicrmw.private
@@ -384,7 +372,6 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB11_5
@@ -400,9 +387,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -410,11 +396,9 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB11_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB11_4
; GFX1250-GISEL-NEXT: .LBB11_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB11_5
@@ -424,17 +408,14 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX1250-GISEL-NEXT: .LBB11_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[4:5], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB11_5
@@ -455,12 +436,11 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB12_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB12_4
; GFX1250-SDAG-NEXT: .LBB12_2: ; %atomicrmw.phi
@@ -472,10 +452,10 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB12_2
; GFX1250-SDAG-NEXT: .LBB12_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_store_b64 v0, v[2:3], off
@@ -495,7 +475,6 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB12_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB12_4
; GFX1250-GISEL-NEXT: .LBB12_2: ; %atomicrmw.phi
@@ -507,12 +486,10 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB12_2
; GFX1250-GISEL-NEXT: .LBB12_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -529,17 +506,15 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB13_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB13_4
; GFX1250-SDAG-NEXT: .LBB13_2: ; %atomicrmw.phi
@@ -551,7 +526,6 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB13_2
; GFX1250-SDAG-NEXT: .LBB13_4: ; %atomicrmw.private
@@ -569,16 +543,14 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB13_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB13_4
; GFX1250-GISEL-NEXT: .LBB13_2: ; %atomicrmw.phi
@@ -590,12 +562,10 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB13_2
; GFX1250-GISEL-NEXT: .LBB13_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -676,17 +646,15 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB18_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB18_4
; GFX1250-SDAG-NEXT: .LBB18_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB18_5
@@ -696,18 +664,17 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB18_2
; GFX1250-SDAG-NEXT: .LBB18_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB18_5
; GFX1250-SDAG-NEXT: .LBB18_5:
@@ -728,11 +695,9 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB18_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB18_4
; GFX1250-GISEL-NEXT: .LBB18_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB18_5
@@ -742,19 +707,16 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB18_2
; GFX1250-GISEL-NEXT: .LBB18_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[4:5]
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB18_5
; GFX1250-GISEL-NEXT: .LBB18_5:
@@ -772,22 +734,19 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB19_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB19_4
; GFX1250-SDAG-NEXT: .LBB19_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB19_5
@@ -797,7 +756,6 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB19_2
; GFX1250-SDAG-NEXT: .LBB19_4: ; %atomicrmw.private
@@ -805,10 +763,9 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB19_5
; GFX1250-SDAG-NEXT: .LBB19_5:
@@ -823,9 +780,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -833,11 +789,9 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB19_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB19_4
; GFX1250-GISEL-NEXT: .LBB19_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB19_5
@@ -847,19 +801,16 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB19_2
; GFX1250-GISEL-NEXT: .LBB19_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[4:5]
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB19_5
; GFX1250-GISEL-NEXT: .LBB19_5:
@@ -879,12 +830,11 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB20_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB20_4
; GFX1250-SDAG-NEXT: .LBB20_2: ; %atomicrmw.phi
@@ -896,15 +846,15 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB20_2
; GFX1250-SDAG-NEXT: .LBB20_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
;
@@ -922,7 +872,6 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB20_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB20_4
; GFX1250-GISEL-NEXT: .LBB20_2: ; %atomicrmw.phi
@@ -934,16 +883,14 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB20_2
; GFX1250-GISEL-NEXT: .LBB20_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5]
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
@@ -959,17 +906,15 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB21_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB21_4
; GFX1250-SDAG-NEXT: .LBB21_2: ; %atomicrmw.phi
@@ -981,7 +926,6 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB21_2
; GFX1250-SDAG-NEXT: .LBB21_4: ; %atomicrmw.private
@@ -989,7 +933,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
;
@@ -1002,16 +946,14 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB21_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB21_4
; GFX1250-GISEL-NEXT: .LBB21_2: ; %atomicrmw.phi
@@ -1023,16 +965,14 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB21_2
; GFX1250-GISEL-NEXT: .LBB21_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5]
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
@@ -1112,17 +1052,15 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB26_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB26_4
; GFX1250-SDAG-NEXT: .LBB26_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB26_5
@@ -1131,21 +1069,18 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB26_2
; GFX1250-SDAG-NEXT: .LBB26_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB26_5
; GFX1250-SDAG-NEXT: .LBB26_5:
@@ -1166,11 +1101,9 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB26_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB26_4
; GFX1250-GISEL-NEXT: .LBB26_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB26_5
@@ -1179,22 +1112,17 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB26_2
; GFX1250-GISEL-NEXT: .LBB26_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v4
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v5, vcc_lo
+; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB26_5
; GFX1250-GISEL-NEXT: .LBB26_5:
@@ -1212,22 +1140,19 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB27_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB27_4
; GFX1250-SDAG-NEXT: .LBB27_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB27_5
@@ -1236,8 +1161,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB27_2
; GFX1250-SDAG-NEXT: .LBB27_4: ; %atomicrmw.private
@@ -1245,12 +1169,9 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB27_5
; GFX1250-SDAG-NEXT: .LBB27_5:
@@ -1265,9 +1186,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1275,11 +1195,9 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB27_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB27_4
; GFX1250-GISEL-NEXT: .LBB27_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB27_5
@@ -1288,22 +1206,17 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB27_2
; GFX1250-GISEL-NEXT: .LBB27_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v4
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v5, vcc_lo
+; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB27_5
; GFX1250-GISEL-NEXT: .LBB27_5:
@@ -1323,12 +1236,11 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB28_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB28_4
; GFX1250-SDAG-NEXT: .LBB28_2: ; %atomicrmw.phi
@@ -1338,19 +1250,17 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2
+; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB28_2
; GFX1250-SDAG-NEXT: .LBB28_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
;
@@ -1368,7 +1278,6 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB28_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB28_4
; GFX1250-GISEL-NEXT: .LBB28_2: ; %atomicrmw.phi
@@ -1378,20 +1287,16 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB28_2
; GFX1250-GISEL-NEXT: .LBB28_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
+; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
@@ -1407,17 +1312,15 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB29_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB29_4
; GFX1250-SDAG-NEXT: .LBB29_2: ; %atomicrmw.phi
@@ -1427,9 +1330,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2
+; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB29_2
; GFX1250-SDAG-NEXT: .LBB29_4: ; %atomicrmw.private
@@ -1437,9 +1339,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
;
@@ -1452,16 +1352,14 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB29_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB29_4
; GFX1250-GISEL-NEXT: .LBB29_2: ; %atomicrmw.phi
@@ -1471,20 +1369,16 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB29_2
; GFX1250-GISEL-NEXT: .LBB29_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
+; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
@@ -1564,17 +1458,15 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB34_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB34_4
; GFX1250-SDAG-NEXT: .LBB34_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB34_5
@@ -1584,10 +1476,10 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB34_2
; GFX1250-SDAG-NEXT: .LBB34_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -1596,7 +1488,6 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, v0, v2
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB34_5
; GFX1250-SDAG-NEXT: .LBB34_5:
@@ -1617,11 +1508,9 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB34_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB34_4
; GFX1250-GISEL-NEXT: .LBB34_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB34_5
@@ -1631,12 +1520,10 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB34_2
; GFX1250-GISEL-NEXT: .LBB34_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -1644,7 +1531,6 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: v_and_b32_e32 v3, v1, v5
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB34_5
; GFX1250-GISEL-NEXT: .LBB34_5:
@@ -1662,22 +1548,19 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB35_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB35_4
; GFX1250-SDAG-NEXT: .LBB35_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB35_5
@@ -1687,7 +1570,6 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB35_2
; GFX1250-SDAG-NEXT: .LBB35_4: ; %atomicrmw.private
@@ -1699,7 +1581,6 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, v0, v2
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB35_5
; GFX1250-SDAG-NEXT: .LBB35_5:
@@ -1714,9 +1595,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1724,11 +1604,9 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB35_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB35_4
; GFX1250-GISEL-NEXT: .LBB35_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB35_5
@@ -1738,12 +1616,10 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB35_2
; GFX1250-GISEL-NEXT: .LBB35_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -1751,7 +1627,6 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: v_and_b32_e32 v3, v1, v5
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB35_5
; GFX1250-GISEL-NEXT: .LBB35_5:
@@ -1771,12 +1646,11 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB36_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB36_4
; GFX1250-SDAG-NEXT: .LBB36_2: ; %atomicrmw.phi
@@ -1788,10 +1662,10 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB36_2
; GFX1250-SDAG-NEXT: .LBB36_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -1815,7 +1689,6 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB36_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB36_4
; GFX1250-GISEL-NEXT: .LBB36_2: ; %atomicrmw.phi
@@ -1827,12 +1700,10 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB36_2
; GFX1250-GISEL-NEXT: .LBB36_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -1853,17 +1724,15 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB37_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB37_4
; GFX1250-SDAG-NEXT: .LBB37_2: ; %atomicrmw.phi
@@ -1875,7 +1744,6 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB37_2
; GFX1250-SDAG-NEXT: .LBB37_4: ; %atomicrmw.private
@@ -1897,16 +1765,14 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB37_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB37_4
; GFX1250-GISEL-NEXT: .LBB37_2: ; %atomicrmw.phi
@@ -1918,12 +1784,10 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB37_2
; GFX1250-GISEL-NEXT: .LBB37_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -2008,17 +1872,15 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB42_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB42_4
; GFX1250-SDAG-NEXT: .LBB42_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB42_5
@@ -2028,10 +1890,10 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB42_2
; GFX1250-SDAG-NEXT: .LBB42_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -2040,7 +1902,6 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
; GFX1250-SDAG-NEXT: v_or_b32_e32 v2, v0, v2
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB42_5
; GFX1250-SDAG-NEXT: .LBB42_5:
@@ -2061,11 +1922,9 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB42_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB42_4
; GFX1250-GISEL-NEXT: .LBB42_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB42_5
@@ -2075,12 +1934,10 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB42_2
; GFX1250-GISEL-NEXT: .LBB42_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -2088,7 +1945,6 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
; GFX1250-GISEL-NEXT: v_or_b32_e32 v3, v1, v5
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB42_5
; GFX1250-GISEL-NEXT: .LBB42_5:
@@ -2106,22 +1962,19 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB43_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB43_4
; GFX1250-SDAG-NEXT: .LBB43_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB43_5
@@ -2131,7 +1984,6 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB43_2
; GFX1250-SDAG-NEXT: .LBB43_4: ; %atomicrmw.private
@@ -2143,7 +1995,6 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-SDAG-NEXT: v_or_b32_e32 v2, v0, v2
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB43_5
; GFX1250-SDAG-NEXT: .LBB43_5:
@@ -2158,9 +2009,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -2168,11 +2018,9 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB43_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB43_4
; GFX1250-GISEL-NEXT: .LBB43_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB43_5
@@ -2182,12 +2030,10 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB43_2
; GFX1250-GISEL-NEXT: .LBB43_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -2195,7 +2041,6 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-GISEL-NEXT: v_or_b32_e32 v3, v1, v5
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB43_5
; GFX1250-GISEL-NEXT: .LBB43_5:
@@ -2215,12 +2060,11 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB44_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB44_4
; GFX1250-SDAG-NEXT: .LBB44_2: ; %atomicrmw.phi
@@ -2232,10 +2076,10 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB44_2
; GFX1250-SDAG-NEXT: .LBB44_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -2259,7 +2103,6 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB44_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB44_4
; GFX1250-GISEL-NEXT: .LBB44_2: ; %atomicrmw.phi
@@ -2271,12 +2114,10 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB44_2
; GFX1250-GISEL-NEXT: .LBB44_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -2297,17 +2138,15 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB45_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB45_4
; GFX1250-SDAG-NEXT: .LBB45_2: ; %atomicrmw.phi
@@ -2319,7 +2158,6 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB45_2
; GFX1250-SDAG-NEXT: .LBB45_4: ; %atomicrmw.private
@@ -2341,16 +2179,14 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB45_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB45_4
; GFX1250-GISEL-NEXT: .LBB45_2: ; %atomicrmw.phi
@@ -2362,12 +2198,10 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB45_2
; GFX1250-GISEL-NEXT: .LBB45_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -2452,17 +2286,15 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB50_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB50_4
; GFX1250-SDAG-NEXT: .LBB50_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB50_5
@@ -2472,10 +2304,10 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB50_2
; GFX1250-SDAG-NEXT: .LBB50_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -2484,7 +2316,6 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, v0, v2
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB50_5
; GFX1250-SDAG-NEXT: .LBB50_5:
@@ -2505,11 +2336,9 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB50_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB50_4
; GFX1250-GISEL-NEXT: .LBB50_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB50_5
@@ -2519,12 +2348,10 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB50_2
; GFX1250-GISEL-NEXT: .LBB50_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -2532,7 +2359,6 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: v_xor_b32_e32 v3, v1, v5
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB50_5
; GFX1250-GISEL-NEXT: .LBB50_5:
@@ -2550,22 +2376,19 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB51_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB51_4
; GFX1250-SDAG-NEXT: .LBB51_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB51_5
@@ -2575,7 +2398,6 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB51_2
; GFX1250-SDAG-NEXT: .LBB51_4: ; %atomicrmw.private
@@ -2587,7 +2409,6 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, v0, v2
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB51_5
; GFX1250-SDAG-NEXT: .LBB51_5:
@@ -2602,9 +2423,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -2612,11 +2432,9 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB51_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB51_4
; GFX1250-GISEL-NEXT: .LBB51_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB51_5
@@ -2626,12 +2444,10 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB51_2
; GFX1250-GISEL-NEXT: .LBB51_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -2639,7 +2455,6 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: v_xor_b32_e32 v3, v1, v5
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB51_5
; GFX1250-GISEL-NEXT: .LBB51_5:
@@ -2659,12 +2474,11 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB52_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB52_4
; GFX1250-SDAG-NEXT: .LBB52_2: ; %atomicrmw.phi
@@ -2676,10 +2490,10 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB52_2
; GFX1250-SDAG-NEXT: .LBB52_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -2703,7 +2517,6 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB52_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB52_4
; GFX1250-GISEL-NEXT: .LBB52_2: ; %atomicrmw.phi
@@ -2715,12 +2528,10 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB52_2
; GFX1250-GISEL-NEXT: .LBB52_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -2741,17 +2552,15 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB53_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB53_4
; GFX1250-SDAG-NEXT: .LBB53_2: ; %atomicrmw.phi
@@ -2763,7 +2572,6 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB53_2
; GFX1250-SDAG-NEXT: .LBB53_4: ; %atomicrmw.private
@@ -2785,16 +2593,14 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB53_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB53_4
; GFX1250-GISEL-NEXT: .LBB53_2: ; %atomicrmw.phi
@@ -2806,12 +2612,10 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB53_2
; GFX1250-GISEL-NEXT: .LBB53_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -2890,17 +2694,15 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB58_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB58_4
; GFX1250-SDAG-NEXT: .LBB58_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB58_5
@@ -2910,21 +2712,19 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB58_2
; GFX1250-SDAG-NEXT: .LBB58_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_cndmask_b32 v2, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB58_5
; GFX1250-SDAG-NEXT: .LBB58_5:
@@ -2945,11 +2745,9 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB58_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB58_4
; GFX1250-GISEL-NEXT: .LBB58_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB58_5
@@ -2959,22 +2757,18 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB58_2
; GFX1250-GISEL-NEXT: .LBB58_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v4, v0 :: v_dual_cndmask_b32 v3, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB58_5
; GFX1250-GISEL-NEXT: .LBB58_5:
@@ -2992,22 +2786,19 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB59_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB59_4
; GFX1250-SDAG-NEXT: .LBB59_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB59_5
@@ -3017,7 +2808,6 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB59_2
; GFX1250-SDAG-NEXT: .LBB59_4: ; %atomicrmw.private
@@ -3027,11 +2817,9 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_cndmask_b32 v2, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB59_5
; GFX1250-SDAG-NEXT: .LBB59_5:
@@ -3046,9 +2834,8 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -3056,11 +2843,9 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB59_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB59_4
; GFX1250-GISEL-NEXT: .LBB59_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB59_5
@@ -3070,22 +2855,18 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB59_2
; GFX1250-GISEL-NEXT: .LBB59_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v4, v0 :: v_dual_cndmask_b32 v3, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB59_5
; GFX1250-GISEL-NEXT: .LBB59_5:
@@ -3105,12 +2886,11 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB60_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB60_4
; GFX1250-SDAG-NEXT: .LBB60_2: ; %atomicrmw.phi
@@ -3121,16 +2901,15 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB60_2
; GFX1250-SDAG-NEXT: .LBB60_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v0, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
@@ -3149,7 +2928,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB60_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB60_4
; GFX1250-GISEL-NEXT: .LBB60_2: ; %atomicrmw.phi
@@ -3160,17 +2938,14 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB60_2
; GFX1250-GISEL-NEXT: .LBB60_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -3187,17 +2962,15 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB61_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB61_4
; GFX1250-SDAG-NEXT: .LBB61_2: ; %atomicrmw.phi
@@ -3208,7 +2981,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB61_2
; GFX1250-SDAG-NEXT: .LBB61_4: ; %atomicrmw.private
@@ -3217,7 +2989,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v0, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
@@ -3231,16 +3002,14 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB61_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB61_4
; GFX1250-GISEL-NEXT: .LBB61_2: ; %atomicrmw.phi
@@ -3251,17 +3020,14 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB61_2
; GFX1250-GISEL-NEXT: .LBB61_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -3336,17 +3102,15 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB66_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB66_4
; GFX1250-SDAG-NEXT: .LBB66_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB66_5
@@ -3356,21 +3120,19 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB66_2
; GFX1250-SDAG-NEXT: .LBB66_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_le_i64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_cndmask_b32 v2, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB66_5
; GFX1250-SDAG-NEXT: .LBB66_5:
@@ -3391,11 +3153,9 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB66_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB66_4
; GFX1250-GISEL-NEXT: .LBB66_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB66_5
@@ -3405,22 +3165,18 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB66_2
; GFX1250-GISEL-NEXT: .LBB66_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v4, v0 :: v_dual_cndmask_b32 v3, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB66_5
; GFX1250-GISEL-NEXT: .LBB66_5:
@@ -3438,22 +3194,19 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB67_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB67_4
; GFX1250-SDAG-NEXT: .LBB67_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB67_5
@@ -3463,7 +3216,6 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB67_2
; GFX1250-SDAG-NEXT: .LBB67_4: ; %atomicrmw.private
@@ -3473,11 +3225,9 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_le_i64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_cndmask_b32 v2, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB67_5
; GFX1250-SDAG-NEXT: .LBB67_5:
@@ -3492,9 +3242,8 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -3502,11 +3251,9 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB67_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB67_4
; GFX1250-GISEL-NEXT: .LBB67_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB67_5
@@ -3516,22 +3263,18 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB67_2
; GFX1250-GISEL-NEXT: .LBB67_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v4, v0 :: v_dual_cndmask_b32 v3, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB67_5
; GFX1250-GISEL-NEXT: .LBB67_5:
@@ -3551,12 +3294,11 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB68_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB68_4
; GFX1250-SDAG-NEXT: .LBB68_2: ; %atomicrmw.phi
@@ -3567,16 +3309,15 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB68_2
; GFX1250-SDAG-NEXT: .LBB68_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_le_i64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v0, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
@@ -3595,7 +3336,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB68_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB68_4
; GFX1250-GISEL-NEXT: .LBB68_2: ; %atomicrmw.phi
@@ -3606,17 +3346,14 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB68_2
; GFX1250-GISEL-NEXT: .LBB68_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -3633,17 +3370,15 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB69_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB69_4
; GFX1250-SDAG-NEXT: .LBB69_2: ; %atomicrmw.phi
@@ -3654,7 +3389,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB69_2
; GFX1250-SDAG-NEXT: .LBB69_4: ; %atomicrmw.private
@@ -3663,7 +3397,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_le_i64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v0, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
@@ -3677,16 +3410,14 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB69_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB69_4
; GFX1250-GISEL-NEXT: .LBB69_2: ; %atomicrmw.phi
@@ -3697,17 +3428,14 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB69_2
; GFX1250-GISEL-NEXT: .LBB69_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -3782,17 +3510,15 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB74_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB74_4
; GFX1250-SDAG-NEXT: .LBB74_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB74_5
@@ -3802,21 +3528,19 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB74_2
; GFX1250-SDAG-NEXT: .LBB74_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_cndmask_b32 v2, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB74_5
; GFX1250-SDAG-NEXT: .LBB74_5:
@@ -3837,11 +3561,9 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB74_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB74_4
; GFX1250-GISEL-NEXT: .LBB74_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB74_5
@@ -3851,22 +3573,18 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB74_2
; GFX1250-GISEL-NEXT: .LBB74_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v4, v0 :: v_dual_cndmask_b32 v3, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB74_5
; GFX1250-GISEL-NEXT: .LBB74_5:
@@ -3884,22 +3602,19 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB75_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB75_4
; GFX1250-SDAG-NEXT: .LBB75_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB75_5
@@ -3909,7 +3624,6 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB75_2
; GFX1250-SDAG-NEXT: .LBB75_4: ; %atomicrmw.private
@@ -3919,11 +3633,9 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_cndmask_b32 v2, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB75_5
; GFX1250-SDAG-NEXT: .LBB75_5:
@@ -3938,9 +3650,8 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -3948,11 +3659,9 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB75_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB75_4
; GFX1250-GISEL-NEXT: .LBB75_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB75_5
@@ -3962,22 +3671,18 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB75_2
; GFX1250-GISEL-NEXT: .LBB75_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v4, v0 :: v_dual_cndmask_b32 v3, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB75_5
; GFX1250-GISEL-NEXT: .LBB75_5:
@@ -3997,12 +3702,11 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB76_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB76_4
; GFX1250-SDAG-NEXT: .LBB76_2: ; %atomicrmw.phi
@@ -4013,16 +3717,15 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB76_2
; GFX1250-SDAG-NEXT: .LBB76_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v0, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
@@ -4041,7 +3744,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB76_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB76_4
; GFX1250-GISEL-NEXT: .LBB76_2: ; %atomicrmw.phi
@@ -4052,17 +3754,14 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB76_2
; GFX1250-GISEL-NEXT: .LBB76_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -4079,17 +3778,15 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB77_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB77_4
; GFX1250-SDAG-NEXT: .LBB77_2: ; %atomicrmw.phi
@@ -4100,7 +3797,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB77_2
; GFX1250-SDAG-NEXT: .LBB77_4: ; %atomicrmw.private
@@ -4109,7 +3805,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v0, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
@@ -4123,16 +3818,14 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB77_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB77_4
; GFX1250-GISEL-NEXT: .LBB77_2: ; %atomicrmw.phi
@@ -4143,17 +3836,14 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB77_2
; GFX1250-GISEL-NEXT: .LBB77_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -4228,17 +3918,15 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB82_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB82_4
; GFX1250-SDAG-NEXT: .LBB82_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB82_5
@@ -4248,21 +3936,19 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB82_2
; GFX1250-SDAG-NEXT: .LBB82_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_le_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_cndmask_b32 v2, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB82_5
; GFX1250-SDAG-NEXT: .LBB82_5:
@@ -4283,11 +3969,9 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB82_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB82_4
; GFX1250-GISEL-NEXT: .LBB82_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB82_5
@@ -4297,22 +3981,18 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB82_2
; GFX1250-GISEL-NEXT: .LBB82_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v4, v0 :: v_dual_cndmask_b32 v3, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB82_5
; GFX1250-GISEL-NEXT: .LBB82_5:
@@ -4330,22 +4010,19 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB83_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB83_4
; GFX1250-SDAG-NEXT: .LBB83_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB83_5
@@ -4355,7 +4032,6 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB83_2
; GFX1250-SDAG-NEXT: .LBB83_4: ; %atomicrmw.private
@@ -4365,11 +4041,9 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_le_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_cndmask_b32 v2, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB83_5
; GFX1250-SDAG-NEXT: .LBB83_5:
@@ -4384,9 +4058,8 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -4394,11 +4067,9 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB83_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB83_4
; GFX1250-GISEL-NEXT: .LBB83_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB83_5
@@ -4408,22 +4079,18 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB83_2
; GFX1250-GISEL-NEXT: .LBB83_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v4, v0 :: v_dual_cndmask_b32 v3, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB83_5
; GFX1250-GISEL-NEXT: .LBB83_5:
@@ -4443,12 +4110,11 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB84_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB84_4
; GFX1250-SDAG-NEXT: .LBB84_2: ; %atomicrmw.phi
@@ -4459,16 +4125,15 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB84_2
; GFX1250-SDAG-NEXT: .LBB84_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_le_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v0, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
@@ -4487,7 +4152,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB84_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB84_4
; GFX1250-GISEL-NEXT: .LBB84_2: ; %atomicrmw.phi
@@ -4498,17 +4162,14 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB84_2
; GFX1250-GISEL-NEXT: .LBB84_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -4525,17 +4186,15 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB85_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB85_4
; GFX1250-SDAG-NEXT: .LBB85_2: ; %atomicrmw.phi
@@ -4546,7 +4205,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB85_2
; GFX1250-SDAG-NEXT: .LBB85_4: ; %atomicrmw.private
@@ -4555,7 +4213,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_le_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v0, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
@@ -4569,16 +4226,14 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB85_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB85_4
; GFX1250-GISEL-NEXT: .LBB85_2: ; %atomicrmw.phi
@@ -4589,17 +4244,14 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB85_2
; GFX1250-GISEL-NEXT: .LBB85_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -4695,17 +4347,15 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB90_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB90_4
; GFX1250-SDAG-NEXT: .LBB90_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB90_5
@@ -4717,20 +4367,18 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB90_2
; GFX1250-SDAG-NEXT: .LBB90_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v2, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v8, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v1, v5 :: v_dual_cndmask_b32 v2, v0, v4
; GFX1250-SDAG-NEXT: scratch_store_b64 v8, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB90_5
; GFX1250-SDAG-NEXT: .LBB90_5:
@@ -4752,11 +4400,9 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB90_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB90_4
; GFX1250-GISEL-NEXT: .LBB90_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB90_5
@@ -4768,21 +4414,17 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB90_2
; GFX1250-GISEL-NEXT: .LBB90_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v0, v6 :: v_dual_cndmask_b32 v3, v1, v7
; GFX1250-GISEL-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB90_5
; GFX1250-GISEL-NEXT: .LBB90_5:
@@ -4802,22 +4444,19 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB91_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB91_4
; GFX1250-SDAG-NEXT: .LBB91_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB91_5
@@ -4829,7 +4468,6 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB91_2
; GFX1250-SDAG-NEXT: .LBB91_4: ; %atomicrmw.private
@@ -4838,11 +4476,9 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v8, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v1, v5 :: v_dual_cndmask_b32 v2, v0, v4
; GFX1250-SDAG-NEXT: scratch_store_b64 v8, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB91_5
; GFX1250-SDAG-NEXT: .LBB91_5:
@@ -4858,9 +4494,8 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -4868,11 +4503,9 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB91_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB91_4
; GFX1250-GISEL-NEXT: .LBB91_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB91_5
@@ -4884,21 +4517,17 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB91_2
; GFX1250-GISEL-NEXT: .LBB91_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v0, v6 :: v_dual_cndmask_b32 v3, v1, v7
; GFX1250-GISEL-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB91_5
; GFX1250-GISEL-NEXT: .LBB91_5:
@@ -4920,12 +4549,11 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB92_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB92_4
; GFX1250-SDAG-NEXT: .LBB92_2: ; %atomicrmw.phi
@@ -4939,16 +4567,15 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB92_2
; GFX1250-SDAG-NEXT: .LBB92_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v5 :: v_dual_cndmask_b32 v0, v0, v4
; GFX1250-SDAG-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
@@ -4968,7 +4595,6 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB92_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB92_4
; GFX1250-GISEL-NEXT: .LBB92_2: ; %atomicrmw.phi
@@ -4982,17 +4608,14 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB92_2
; GFX1250-GISEL-NEXT: .LBB92_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_cndmask_b32 v1, v1, v7
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -5010,17 +4633,15 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB93_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB93_4
; GFX1250-SDAG-NEXT: .LBB93_2: ; %atomicrmw.phi
@@ -5034,7 +4655,6 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB93_2
; GFX1250-SDAG-NEXT: .LBB93_4: ; %atomicrmw.private
@@ -5043,7 +4663,6 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v5 :: v_dual_cndmask_b32 v0, v0, v4
; GFX1250-SDAG-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
@@ -5058,16 +4677,14 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB93_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB93_4
; GFX1250-GISEL-NEXT: .LBB93_2: ; %atomicrmw.phi
@@ -5081,17 +4698,14 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB93_2
; GFX1250-GISEL-NEXT: .LBB93_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_cndmask_b32 v1, v1, v7
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -5164,17 +4778,15 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB98_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB98_4
; GFX1250-SDAG-NEXT: .LBB98_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB98_5
@@ -5183,23 +4795,21 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB98_2
; GFX1250-SDAG-NEXT: .LBB98_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB98_5
; GFX1250-SDAG-NEXT: .LBB98_5:
@@ -5220,11 +4830,9 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB98_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB98_4
; GFX1250-GISEL-NEXT: .LBB98_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB98_5
@@ -5233,25 +4841,21 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB98_2
; GFX1250-GISEL-NEXT: .LBB98_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB98_5
; GFX1250-GISEL-NEXT: .LBB98_5:
@@ -5269,22 +4873,19 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB99_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB99_4
; GFX1250-SDAG-NEXT: .LBB99_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB99_5
@@ -5293,7 +4894,6 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB99_2
; GFX1250-SDAG-NEXT: .LBB99_4: ; %atomicrmw.private
@@ -5302,14 +4902,12 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB99_5
; GFX1250-SDAG-NEXT: .LBB99_5:
@@ -5324,9 +4922,8 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -5334,11 +4931,9 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB99_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB99_4
; GFX1250-GISEL-NEXT: .LBB99_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB99_5
@@ -5347,25 +4942,21 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB99_2
; GFX1250-GISEL-NEXT: .LBB99_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB99_5
; GFX1250-GISEL-NEXT: .LBB99_5:
@@ -5385,12 +4976,11 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB100_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB100_4
; GFX1250-SDAG-NEXT: .LBB100_2: ; %atomicrmw.phi
@@ -5400,18 +4990,17 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB100_2
; GFX1250-SDAG-NEXT: .LBB100_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, 0, v5 :: v_dual_cndmask_b32 v0, 0, v4
; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
@@ -5430,7 +5019,6 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB100_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB100_4
; GFX1250-GISEL-NEXT: .LBB100_2: ; %atomicrmw.phi
@@ -5440,19 +5028,16 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB100_2
; GFX1250-GISEL-NEXT: .LBB100_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off
@@ -5470,17 +5055,15 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB101_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB101_4
; GFX1250-SDAG-NEXT: .LBB101_2: ; %atomicrmw.phi
@@ -5490,7 +5073,6 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB101_2
; GFX1250-SDAG-NEXT: .LBB101_4: ; %atomicrmw.private
@@ -5498,10 +5080,9 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, 0, v5 :: v_dual_cndmask_b32 v0, 0, v4
; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
@@ -5515,16 +5096,14 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB101_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB101_4
; GFX1250-GISEL-NEXT: .LBB101_2: ; %atomicrmw.phi
@@ -5534,19 +5113,16 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB101_2
; GFX1250-GISEL-NEXT: .LBB101_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off
@@ -5621,17 +5197,15 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB106_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB106_4
; GFX1250-SDAG-NEXT: .LBB106_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB106_5
@@ -5640,10 +5214,10 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB106_2
; GFX1250-SDAG-NEXT: .LBB106_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -5651,9 +5225,8 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], -1, v[0:1]
; GFX1250-SDAG-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v2, v4, v2
; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[2:3], off
@@ -5678,11 +5251,9 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB106_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB106_4
; GFX1250-GISEL-NEXT: .LBB106_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB106_5
@@ -5691,21 +5262,18 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB106_2
; GFX1250-GISEL-NEXT: .LBB106_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, -1
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], -1, v[0:1]
; GFX1250-GISEL-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v2, v4 :: v_dual_cndmask_b32 v3, v3, v5
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
@@ -5727,22 +5295,19 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB107_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB107_4
; GFX1250-SDAG-NEXT: .LBB107_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB107_5
@@ -5751,7 +5316,6 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB107_2
; GFX1250-SDAG-NEXT: .LBB107_4: ; %atomicrmw.private
@@ -5762,9 +5326,8 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], -1, v[0:1]
; GFX1250-SDAG-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v2, v4, v2
; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[2:3], off
@@ -5783,9 +5346,8 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -5793,11 +5355,9 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB107_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB107_4
; GFX1250-GISEL-NEXT: .LBB107_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB107_5
@@ -5806,21 +5366,18 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB107_2
; GFX1250-GISEL-NEXT: .LBB107_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, -1
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], -1, v[0:1]
; GFX1250-GISEL-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v2, v4 :: v_dual_cndmask_b32 v3, v3, v5
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
@@ -5844,12 +5401,11 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB108_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB108_4
; GFX1250-SDAG-NEXT: .LBB108_2: ; %atomicrmw.phi
@@ -5859,20 +5415,18 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB108_2
; GFX1250-SDAG-NEXT: .LBB108_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, -1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], -1, v[0:1]
; GFX1250-SDAG-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v3 :: v_dual_cndmask_b32 v0, v0, v2
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
@@ -5892,7 +5446,6 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB108_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB108_4
; GFX1250-GISEL-NEXT: .LBB108_2: ; %atomicrmw.phi
@@ -5902,21 +5455,17 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB108_2
; GFX1250-GISEL-NEXT: .LBB108_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, -1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], -1, v[0:1]
; GFX1250-GISEL-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
@@ -5934,17 +5483,15 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB109_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB109_4
; GFX1250-SDAG-NEXT: .LBB109_2: ; %atomicrmw.phi
@@ -5954,7 +5501,6 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB109_2
; GFX1250-SDAG-NEXT: .LBB109_4: ; %atomicrmw.private
@@ -5964,10 +5510,8 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, -1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], -1, v[0:1]
; GFX1250-SDAG-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v3 :: v_dual_cndmask_b32 v0, v0, v2
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
@@ -5982,16 +5526,14 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB109_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB109_4
; GFX1250-GISEL-NEXT: .LBB109_2: ; %atomicrmw.phi
@@ -6001,21 +5543,17 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB109_2
; GFX1250-GISEL-NEXT: .LBB109_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, -1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], -1, v[0:1]
; GFX1250-GISEL-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off