diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll | 1081 |
1 files changed, 416 insertions, 665 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index 1e7855c..eefc781 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -541,11 +541,10 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-LABEL: flat_xchg_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -570,9 +569,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB10_2 ; GFX1250-SDAG-NEXT: .LBB10_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: s_clause 0x1 @@ -586,14 +584,13 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -618,10 +615,9 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB10_2 ; GFX1250-GISEL-NEXT: .LBB10_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -727,13 +723,12 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB11_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -754,9 +749,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB11_2 ; GFX1250-SDAG-NEXT: .LBB11_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: s_clause 0x1 @@ -770,8 +764,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -780,7 +773,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -805,10 +798,9 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB11_2 ; GFX1250-GISEL-NEXT: .LBB11_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -917,11 +909,10 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -943,9 +934,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB12_2 ; GFX1250-SDAG-NEXT: .LBB12_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_store_b64 v0, v[2:3], off @@ -953,15 +943,14 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB12_3 @@ -982,10 +971,9 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB12_2 ; GFX1250-GISEL-NEXT: .LBB12_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off ; GFX1250-GISEL-NEXT: s_endpgm @@ -1069,11 +1057,9 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB13_3 @@ -1094,9 +1080,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB13_2 ; GFX1250-SDAG-NEXT: .LBB13_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_store_b64 v0, v[2:3], off @@ -1104,8 +1089,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1115,7 +1099,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB13_3 @@ -1136,10 +1120,9 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB13_2 ; GFX1250-GISEL-NEXT: .LBB13_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off ; GFX1250-GISEL-NEXT: s_endpgm @@ -1400,11 +1383,10 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_add_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -1429,9 +1411,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB18_2 ; GFX1250-SDAG-NEXT: .LBB18_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -1445,14 +1426,13 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -1477,10 +1457,9 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB18_2 ; GFX1250-GISEL-NEXT: .LBB18_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -1590,13 +1569,12 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB19_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -1617,9 +1595,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB19_2 ; GFX1250-SDAG-NEXT: .LBB19_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -1633,8 +1610,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -1643,7 +1619,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -1668,10 +1644,9 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB19_2 ; GFX1250-GISEL-NEXT: .LBB19_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -1784,11 +1759,10 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -1810,9 +1784,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB20_2 ; GFX1250-SDAG-NEXT: .LBB20_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -1823,15 +1796,14 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB20_3 @@ -1852,10 +1824,9 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB20_2 ; GFX1250-GISEL-NEXT: .LBB20_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -1950,11 +1921,9 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB21_3 @@ -1975,9 +1944,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB21_2 ; GFX1250-SDAG-NEXT: .LBB21_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -1988,8 +1956,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1999,7 +1966,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB21_3 @@ -2020,10 +1987,9 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB21_2 ; GFX1250-GISEL-NEXT: .LBB21_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -2295,11 +2261,10 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_sub_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2324,9 +2289,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB26_2 ; GFX1250-SDAG-NEXT: .LBB26_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -2340,14 +2304,13 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -2372,10 +2335,9 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB26_2 ; GFX1250-GISEL-NEXT: .LBB26_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -2487,13 +2449,12 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB27_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -2514,9 +2475,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB27_2 ; GFX1250-SDAG-NEXT: .LBB27_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -2530,8 +2490,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -2540,7 +2499,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -2565,10 +2524,9 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB27_2 ; GFX1250-GISEL-NEXT: .LBB27_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -2683,11 +2641,10 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -2709,9 +2666,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB28_2 ; GFX1250-SDAG-NEXT: .LBB28_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -2722,15 +2678,14 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB28_3 @@ -2751,10 +2706,9 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB28_2 ; GFX1250-GISEL-NEXT: .LBB28_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -2851,11 +2805,9 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB29_3 @@ -2876,9 +2828,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB29_2 ; GFX1250-SDAG-NEXT: .LBB29_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -2889,8 +2840,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2900,7 +2850,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB29_3 @@ -2921,10 +2871,9 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB29_2 ; GFX1250-GISEL-NEXT: .LBB29_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -3198,11 +3147,10 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_and_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -3227,9 +3175,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB34_2 ; GFX1250-SDAG-NEXT: .LBB34_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -3244,14 +3191,13 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -3276,10 +3222,9 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB34_2 ; GFX1250-GISEL-NEXT: .LBB34_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -3390,13 +3335,12 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB35_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -3417,9 +3361,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB35_2 ; GFX1250-SDAG-NEXT: .LBB35_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -3434,8 +3377,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -3444,7 +3386,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -3469,10 +3411,9 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB35_2 ; GFX1250-GISEL-NEXT: .LBB35_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -3586,11 +3527,10 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -3612,9 +3552,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB36_2 ; GFX1250-SDAG-NEXT: .LBB36_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -3626,15 +3565,14 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB36_3 @@ -3655,10 +3593,9 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB36_2 ; GFX1250-GISEL-NEXT: .LBB36_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -3754,11 +3691,9 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB37_3 @@ -3779,9 +3714,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB37_2 ; GFX1250-SDAG-NEXT: .LBB37_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -3793,8 +3727,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3804,7 +3737,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB37_3 @@ -3825,10 +3758,9 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB37_2 ; GFX1250-GISEL-NEXT: .LBB37_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -4101,11 +4033,10 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-SDAG-LABEL: flat_or_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -4130,9 +4061,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB42_2 ; GFX1250-SDAG-NEXT: .LBB42_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -4147,14 +4077,13 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -4179,10 +4108,9 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB42_2 ; GFX1250-GISEL-NEXT: .LBB42_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -4293,13 +4221,12 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB43_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -4320,9 +4247,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB43_2 ; GFX1250-SDAG-NEXT: .LBB43_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -4337,8 +4263,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -4347,7 +4272,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -4372,10 +4297,9 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB43_2 ; GFX1250-GISEL-NEXT: .LBB43_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -4489,11 +4413,10 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -4515,9 +4438,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB44_2 ; GFX1250-SDAG-NEXT: .LBB44_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -4529,15 +4451,14 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB44_3 @@ -4558,10 +4479,9 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB44_2 ; GFX1250-GISEL-NEXT: .LBB44_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -4657,11 +4577,9 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB45_3 @@ -4682,9 +4600,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB45_2 ; GFX1250-SDAG-NEXT: .LBB45_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -4696,8 +4613,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4707,7 +4623,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB45_3 @@ -4728,10 +4644,9 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB45_2 ; GFX1250-GISEL-NEXT: .LBB45_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -5004,11 +4919,10 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_xor_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -5033,9 +4947,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB50_2 ; GFX1250-SDAG-NEXT: .LBB50_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -5050,14 +4963,13 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -5082,10 +4994,9 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB50_2 ; GFX1250-GISEL-NEXT: .LBB50_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -5196,13 +5107,12 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB51_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -5223,9 +5133,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB51_2 ; GFX1250-SDAG-NEXT: .LBB51_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -5240,8 +5149,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -5250,7 +5158,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -5275,10 +5183,9 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB51_2 ; GFX1250-GISEL-NEXT: .LBB51_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -5392,11 +5299,10 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -5418,9 +5324,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB52_2 ; GFX1250-SDAG-NEXT: .LBB52_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -5432,15 +5337,14 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB52_3 @@ -5461,10 +5365,9 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB52_2 ; GFX1250-GISEL-NEXT: .LBB52_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -5560,11 +5463,9 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB53_3 @@ -5585,9 +5486,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB53_2 ; GFX1250-SDAG-NEXT: .LBB53_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -5599,8 +5499,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5610,7 +5509,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB53_3 @@ -5631,10 +5530,9 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB53_2 ; GFX1250-GISEL-NEXT: .LBB53_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -5877,11 +5775,10 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_max_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -5902,9 +5799,8 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB58_2 ; GFX1250-SDAG-NEXT: .LBB58_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -5918,14 +5814,13 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -5946,10 +5841,9 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB58_2 ; GFX1250-GISEL-NEXT: .LBB58_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -6061,13 +5955,12 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB59_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -6084,9 +5977,8 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB59_2 ; GFX1250-SDAG-NEXT: .LBB59_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -6100,8 +5992,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -6110,7 +6001,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -6131,10 +6022,9 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB59_2 ; GFX1250-GISEL-NEXT: .LBB59_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -6249,11 +6139,10 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -6272,9 +6161,8 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB60_2 ; GFX1250-SDAG-NEXT: .LBB60_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -6285,15 +6173,14 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB60_3 @@ -6311,10 +6198,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB60_2 ; GFX1250-GISEL-NEXT: .LBB60_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -6409,11 +6295,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB61_3 @@ -6431,9 +6315,8 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB61_2 ; GFX1250-SDAG-NEXT: .LBB61_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -6444,8 +6327,7 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6455,7 +6337,7 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB61_3 @@ -6473,10 +6355,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB61_2 ; GFX1250-GISEL-NEXT: .LBB61_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -6718,11 +6599,10 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_min_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -6743,9 +6623,8 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB66_2 ; GFX1250-SDAG-NEXT: .LBB66_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -6759,14 +6638,13 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -6787,10 +6665,9 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB66_2 ; GFX1250-GISEL-NEXT: .LBB66_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -6902,13 +6779,12 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB67_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -6925,9 +6801,8 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB67_2 ; GFX1250-SDAG-NEXT: .LBB67_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -6941,8 +6816,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -6951,7 +6825,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -6972,10 +6846,9 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB67_2 ; GFX1250-GISEL-NEXT: .LBB67_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -7090,11 +6963,10 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -7113,9 +6985,8 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB68_2 ; GFX1250-SDAG-NEXT: .LBB68_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -7126,15 +6997,14 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB68_3 @@ -7152,10 +7022,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB68_2 ; GFX1250-GISEL-NEXT: .LBB68_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -7250,11 +7119,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB69_3 @@ -7272,9 +7139,8 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB69_2 ; GFX1250-SDAG-NEXT: .LBB69_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -7285,8 +7151,7 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7296,7 +7161,7 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB69_3 @@ -7314,10 +7179,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB69_2 ; GFX1250-GISEL-NEXT: .LBB69_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -7559,11 +7423,10 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-LABEL: flat_umax_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -7584,9 +7447,8 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB74_2 ; GFX1250-SDAG-NEXT: .LBB74_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -7600,14 +7462,13 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -7628,10 +7489,9 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB74_2 ; GFX1250-GISEL-NEXT: .LBB74_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -7743,13 +7603,12 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB75_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -7766,9 +7625,8 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB75_2 ; GFX1250-SDAG-NEXT: .LBB75_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -7782,8 +7640,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -7792,7 +7649,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -7813,10 +7670,9 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB75_2 ; GFX1250-GISEL-NEXT: .LBB75_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -7931,11 +7787,10 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -7954,9 +7809,8 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB76_2 ; GFX1250-SDAG-NEXT: .LBB76_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -7967,15 +7821,14 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB76_3 @@ -7993,10 +7846,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB76_2 ; GFX1250-GISEL-NEXT: .LBB76_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -8091,11 +7943,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB77_3 @@ -8113,9 +7963,8 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB77_2 ; GFX1250-SDAG-NEXT: .LBB77_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -8126,8 +7975,7 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8137,7 +7985,7 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB77_3 @@ -8155,10 +8003,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB77_2 ; GFX1250-GISEL-NEXT: .LBB77_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -8400,11 +8247,10 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-LABEL: flat_umin_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -8425,9 +8271,8 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB82_2 ; GFX1250-SDAG-NEXT: .LBB82_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -8441,14 +8286,13 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -8469,10 +8313,9 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB82_2 ; GFX1250-GISEL-NEXT: .LBB82_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -8584,13 +8427,12 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB83_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -8607,9 +8449,8 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB83_2 ; GFX1250-SDAG-NEXT: .LBB83_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -8623,8 +8464,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -8633,7 +8473,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -8654,10 +8494,9 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB83_2 ; GFX1250-GISEL-NEXT: .LBB83_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -8772,11 +8611,10 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -8795,9 +8633,8 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB84_2 ; GFX1250-SDAG-NEXT: .LBB84_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -8808,15 +8645,14 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB84_3 @@ -8834,10 +8670,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB84_2 ; GFX1250-GISEL-NEXT: .LBB84_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -8932,11 +8767,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB85_3 @@ -8954,9 +8787,8 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB85_2 ; GFX1250-SDAG-NEXT: .LBB85_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -8967,8 +8799,7 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8978,7 +8809,7 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB85_3 @@ -8996,10 +8827,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB85_2 ; GFX1250-GISEL-NEXT: .LBB85_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -9281,12 +9111,11 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v5, v4 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v4, v3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v3 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -9311,9 +9140,8 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB90_2 ; GFX1250-SDAG-NEXT: .LBB90_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v2 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v8, off @@ -9328,15 +9156,14 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v10, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v0, v5 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v3, v10 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v3 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -9361,10 +9188,9 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB90_2 ; GFX1250-GISEL-NEXT: .LBB90_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -9485,13 +9311,12 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v3 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v3 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB91_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -9512,9 +9337,8 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB91_2 ; GFX1250-SDAG-NEXT: .LBB91_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v2 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v8, off @@ -9529,10 +9353,9 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v10, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -9540,7 +9363,7 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v3, v10 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v3 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -9565,10 +9388,9 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB91_2 ; GFX1250-GISEL-NEXT: .LBB91_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -9690,13 +9512,12 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v5, v4 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v4, v3 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v2 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -9718,9 +9539,8 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB92_2 ; GFX1250-SDAG-NEXT: .LBB92_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v2, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v2, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v2, off @@ -9732,15 +9552,15 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, src_flat_scratch_base_hi :: v_dual_mov_b32 v8, v1 ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v1, v3, v5 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB92_3 @@ -9761,10 +9581,9 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB92_2 ; GFX1250-GISEL-NEXT: .LBB92_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -9869,11 +9688,9 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v2 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB93_3 @@ -9894,9 +9711,8 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB93_2 ; GFX1250-SDAG-NEXT: .LBB93_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v2, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v2, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v2, off @@ -9908,18 +9724,18 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, src_flat_scratch_base_hi :: v_dual_mov_b32 v8, v1 ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v1, v3, v5 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB93_3 @@ -9940,10 +9756,9 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB93_2 ; GFX1250-GISEL-NEXT: .LBB93_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -10188,11 +10003,10 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_inc_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -10214,10 +10028,9 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB98_2 ; GFX1250-SDAG-NEXT: .LBB98_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off @@ -10233,14 +10046,13 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -10262,18 +10074,17 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB98_2 ; GFX1250-GISEL-NEXT: .LBB98_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 @@ -10386,13 +10197,12 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB99_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -10410,10 +10220,9 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB99_2 ; GFX1250-SDAG-NEXT: .LBB99_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off @@ -10429,8 +10238,7 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -10439,7 +10247,7 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -10461,18 +10269,17 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB99_2 ; GFX1250-GISEL-NEXT: .LBB99_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 @@ -10588,11 +10395,10 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -10610,9 +10416,8 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB100_2 ; GFX1250-SDAG-NEXT: .LBB100_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off @@ -10625,15 +10430,14 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB100_3 @@ -10650,17 +10454,16 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB100_2 ; GFX1250-GISEL-NEXT: .LBB100_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm @@ -10754,11 +10557,9 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB101_3 @@ -10775,9 +10576,8 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB101_2 ; GFX1250-SDAG-NEXT: .LBB101_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off @@ -10790,8 +10590,7 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10801,7 +10600,7 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB101_3 @@ -10818,17 +10617,16 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB101_2 ; GFX1250-GISEL-NEXT: .LBB101_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm @@ -11064,11 +10862,10 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_dec_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -11090,10 +10887,9 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB106_2 ; GFX1250-SDAG-NEXT: .LBB106_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s0, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off @@ -11112,14 +10908,13 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -11141,11 +10936,10 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB106_2 ; GFX1250-GISEL-NEXT: .LBB106_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -11268,13 +11062,12 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB107_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -11292,10 +11085,9 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB107_2 ; GFX1250-SDAG-NEXT: .LBB107_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s0, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off @@ -11314,8 +11106,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -11324,7 +11115,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -11346,11 +11137,10 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB107_2 ; GFX1250-GISEL-NEXT: .LBB107_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -11476,11 +11266,10 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -11498,9 +11287,8 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB108_2 ; GFX1250-SDAG-NEXT: .LBB108_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -11516,15 +11304,14 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB108_3 @@ -11541,10 +11328,9 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB108_2 ; GFX1250-GISEL-NEXT: .LBB108_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -11648,11 +11434,9 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB109_3 @@ -11669,9 +11453,8 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB109_2 ; GFX1250-SDAG-NEXT: .LBB109_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -11687,8 +11470,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -11698,7 +11480,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB109_3 @@ -11715,10 +11497,9 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB109_2 ; GFX1250-GISEL-NEXT: .LBB109_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -11834,12 +11615,10 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB110_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %atomicrmw.check.private -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB110_4 ; GFX1250-SDAG-NEXT: ; %bb.2: ; %atomicrmw.global @@ -11855,9 +11634,7 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: .LBB110_4: ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: .LBB110_5: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, s2, -1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s2 @@ -11891,10 +11668,9 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB110_6 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.check.private -; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s2, 0x4000000 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB110_3 @@ -11910,9 +11686,7 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB110_5 ; GFX1250-GISEL-NEXT: ; %bb.4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-GISEL-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-GISEL-NEXT: s_cselect_b32 s2, s2, -1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s2 @@ -12060,12 +11834,10 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: .LBB111_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-SDAG-NEXT: .LBB111_3: ; %atomicrmw.check.private -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX1250-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB111_5 @@ -12079,9 +11851,7 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX1250-SDAG-NEXT: s_cbranch_vccnz .LBB111_7 ; GFX1250-SDAG-NEXT: ; %bb.6: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, s2, -1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s2 @@ -12112,9 +11882,8 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s1, s3 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB111_6 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.check.private -; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-GISEL-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s2, 0x4000000 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB111_3 @@ -12130,9 +11899,7 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB111_5 ; GFX1250-GISEL-NEXT: ; %bb.4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-GISEL-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-GISEL-NEXT: s_cselect_b32 s2, s2, -1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s2 @@ -12261,9 +12028,8 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -12279,10 +12045,9 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: .LBB112_2: ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: .LBB112_3: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo -; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -12301,10 +12066,9 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50 ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0 -; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4 +; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB112_2 @@ -12320,10 +12084,9 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB112_4 ; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0 +; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -12413,9 +12176,8 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -12434,11 +12196,10 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB113_2 ; GFX1250-SDAG-NEXT: .LBB113_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo +; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 -; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -12454,9 +12215,8 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50 ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0 -; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4 +; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB113_2 @@ -12473,10 +12233,9 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB113_4 ; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0 +; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -12562,9 +12321,8 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -12580,10 +12338,9 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: .LBB114_2: ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: .LBB114_3: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo -; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -12602,10 +12359,9 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50 ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0 -; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4 +; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB114_2 @@ -12621,10 +12377,9 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB114_4 ; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0 +; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -12714,9 +12469,8 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -12735,11 +12489,10 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB115_2 ; GFX1250-SDAG-NEXT: .LBB115_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo +; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 -; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -12755,9 +12508,8 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50 ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0 -; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4 +; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB115_2 @@ -12774,10 +12526,9 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB115_4 ; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0 +; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 |