aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll231
-rw-r--r--llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll25
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-attributor-flat-scratch-init-asan.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll410
-rw-r--r--llvm/test/CodeGen/AMDGPU/bf16.ll34
-rw-r--r--llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/calling-conventions.ll202
-rw-r--r--llvm/test/CodeGen/AMDGPU/carryout-selection.ll166
-rw-r--r--llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir52
-rw-r--r--llvm/test/CodeGen/AMDGPU/ds_write2.ll19
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll1081
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmax3.ll228
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmin3.ll300
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll151
-rw-r--r--llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir12
-rw-r--r--llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.id.ll64
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll28
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll54
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-store-opt-scale-offset.mir6
-rw-r--r--llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad_64_32.ll13
-rw-r--r--llvm/test/CodeGen/AMDGPU/mai-hazards.mir6
-rw-r--r--llvm/test/CodeGen/AMDGPU/max.ll55
-rw-r--r--llvm/test/CodeGen/AMDGPU/min.ll129
-rw-r--r--llvm/test/CodeGen/AMDGPU/mul.ll81
-rw-r--r--llvm/test/CodeGen/AMDGPU/packed-fp32.ll444
-rw-r--r--llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/preload-kernargs.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir16
-rw-r--r--llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir8
-rw-r--r--llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir12
-rw-r--r--llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir24
-rw-r--r--llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll90
-rw-r--r--llvm/test/CodeGen/AMDGPU/true16-fold.mir25
-rw-r--r--llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/wait-xcnt.mir9
-rw-r--r--llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll72
45 files changed, 2037 insertions, 2197 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
index 41fda6d..efa51ea 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
@@ -90,26 +90,24 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v10, v[0:1], off offset:8
; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v11, v[0:1], off offset:9
; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v12, v[0:1], off offset:11
-; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v0, v[0:1], off offset:10
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v13, v[0:1], off offset:10
; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0xa
; GFX1250-NOUNALIGNED-NEXT: s_wait_xcnt 0x0
-; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
+; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v3, 8, v2
; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x8
-; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v3, 16, v4 :: v_dual_lshlrev_b32 v2, 24, v5
+; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v2, 16, v4 :: v_dual_lshlrev_b32 v1, 24, v5
; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x6
-; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v6
+; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v6
; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x4
-; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v6, 16, v8 :: v_dual_lshlrev_b32 v5, 24, v9
+; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v5, 16, v8 :: v_dual_lshlrev_b32 v4, 24, v9
+; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v0, v1, v2, v0
; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x2
-; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v7, v11, 8, v10
-; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x1
-; GFX1250-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v12
+; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v10
; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 16, v0
-; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v0, v2, v3, v1
-; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v1, v5, v6, v4
-; GFX1250-NOUNALIGNED-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v2, v8, v9, v7
+; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v7, 24, v12 :: v_dual_lshlrev_b32 v8, 16, v13
+; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v1, v4, v5, v3
+; GFX1250-NOUNALIGNED-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v2, v7, v8, v6
; GFX1250-NOUNALIGNED-NEXT: s_set_pc_i64 s[30:31]
;
; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
@@ -942,7 +940,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg
;
; GFX1250-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1:
; GFX1250-NOUNALIGNED: ; %bb.0:
-; GFX1250-NOUNALIGNED-NEXT: s_clause 0xa
+; GFX1250-NOUNALIGNED-NEXT: s_clause 0xb
; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s2, s[0:1], 0x1
; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s3, s[0:1], 0x3
; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s4, s[0:1], 0x2
@@ -954,27 +952,26 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg
; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s10, s[0:1], 0x0
; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s11, s[0:1], 0x4
; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s12, s[0:1], 0xa
-; GFX1250-NOUNALIGNED-NEXT: s_wait_xcnt 0x0
-; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s1, s[0:1], 0x8
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s13, s[0:1], 0x8
; GFX1250-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s0, s2, 8
-; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s2, s3, 24
-; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s3, s4, 16
-; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s4, s5, 8
-; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s3
-; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s5, s6, 24
-; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s6, s7, 16
-; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s7, s8, 8
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s1, s3, 24
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s2, s4, 16
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s3, s5, 8
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s1, s1, s2
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s4, s6, 24
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s5, s7, 16
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s6, s8, 8
; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s0, s0, s10
-; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s8, s9, 24
-; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s0, s2, s0
-; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s2, s12, 16
-; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s3, s4, s11
-; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s4, s5, s6
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s7, s9, 24
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s0, s1, s0
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s1, s12, 16
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s3, s11
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s3, s4, s5
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s4, s6, s13
; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s5, s7, s1
-; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s8, s2
-; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s1, s4, s3
-; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s5
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s1, s3, s2
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s5, s4
; GFX1250-NOUNALIGNED-NEXT: ; return to shader part epilog
;
; GFX9-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
@@ -1351,11 +1348,25 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg
}
define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg %ptr) {
-; GFX12-LABEL: s_load_constant_v3i32_align4:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ; return to shader part epilog
+; GFX12-UNALIGNED-LABEL: s_load_constant_v3i32_align4:
+; GFX12-UNALIGNED: ; %bb.0:
+; GFX12-UNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-UNALIGNED-NEXT: ; return to shader part epilog
+;
+; GFX12-NOUNALIGNED-LABEL: s_load_constant_v3i32_align4:
+; GFX12-NOUNALIGNED: ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: s_load_constant_v3i32_align4:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_mov_b32 s4, s0
+; GFX1250-NEXT: s_mov_b32 s5, s1
+; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_load_constant_v3i32_align4:
; GFX9: ; %bb.0:
@@ -1388,11 +1399,25 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg
}
define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) {
-; GFX12-LABEL: s_load_constant_i96_align8:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ; return to shader part epilog
+; GFX12-UNALIGNED-LABEL: s_load_constant_i96_align8:
+; GFX12-UNALIGNED: ; %bb.0:
+; GFX12-UNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-UNALIGNED-NEXT: ; return to shader part epilog
+;
+; GFX12-NOUNALIGNED-LABEL: s_load_constant_i96_align8:
+; GFX12-NOUNALIGNED: ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: s_load_constant_i96_align8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_mov_b32 s4, s0
+; GFX1250-NEXT: s_mov_b32 s5, s1
+; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_load_constant_i96_align8:
; GFX9: ; %bb.0:
@@ -1425,11 +1450,25 @@ define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) {
}
define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg %ptr) {
-; GFX12-LABEL: s_load_constant_v3i32_align8:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ; return to shader part epilog
+; GFX12-UNALIGNED-LABEL: s_load_constant_v3i32_align8:
+; GFX12-UNALIGNED: ; %bb.0:
+; GFX12-UNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-UNALIGNED-NEXT: ; return to shader part epilog
+;
+; GFX12-NOUNALIGNED-LABEL: s_load_constant_v3i32_align8:
+; GFX12-NOUNALIGNED: ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: s_load_constant_v3i32_align8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_mov_b32 s4, s0
+; GFX1250-NEXT: s_mov_b32 s5, s1
+; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_load_constant_v3i32_align8:
; GFX9: ; %bb.0:
@@ -1462,11 +1501,25 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg
}
define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg %ptr) {
-; GFX12-LABEL: s_load_constant_v6i16_align8:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ; return to shader part epilog
+; GFX12-UNALIGNED-LABEL: s_load_constant_v6i16_align8:
+; GFX12-UNALIGNED: ; %bb.0:
+; GFX12-UNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-UNALIGNED-NEXT: ; return to shader part epilog
+;
+; GFX12-NOUNALIGNED-LABEL: s_load_constant_v6i16_align8:
+; GFX12-NOUNALIGNED: ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: s_load_constant_v6i16_align8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_mov_b32 s4, s0
+; GFX1250-NEXT: s_mov_b32 s5, s1
+; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_load_constant_v6i16_align8:
; GFX9: ; %bb.0:
@@ -1500,24 +1553,64 @@ define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg
}
define amdgpu_ps <12 x i8> @s_load_constant_v12i8_align8(ptr addrspace(4) inreg %ptr) {
-; GFX12-LABEL: s_load_constant_v12i8_align8:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s13, s0, 8
-; GFX12-NEXT: s_lshr_b32 s12, s0, 16
-; GFX12-NEXT: s_lshr_b32 s3, s0, 24
-; GFX12-NEXT: s_lshr_b32 s5, s1, 8
-; GFX12-NEXT: s_lshr_b32 s6, s1, 16
-; GFX12-NEXT: s_lshr_b32 s7, s1, 24
-; GFX12-NEXT: s_lshr_b32 s9, s2, 8
-; GFX12-NEXT: s_lshr_b32 s10, s2, 16
-; GFX12-NEXT: s_lshr_b32 s11, s2, 24
-; GFX12-NEXT: s_mov_b32 s4, s1
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s1, s13
-; GFX12-NEXT: s_mov_b32 s2, s12
-; GFX12-NEXT: ; return to shader part epilog
+; GFX12-UNALIGNED-LABEL: s_load_constant_v12i8_align8:
+; GFX12-UNALIGNED: ; %bb.0:
+; GFX12-UNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_lshr_b32 s13, s0, 8
+; GFX12-UNALIGNED-NEXT: s_lshr_b32 s12, s0, 16
+; GFX12-UNALIGNED-NEXT: s_lshr_b32 s3, s0, 24
+; GFX12-UNALIGNED-NEXT: s_lshr_b32 s5, s1, 8
+; GFX12-UNALIGNED-NEXT: s_lshr_b32 s6, s1, 16
+; GFX12-UNALIGNED-NEXT: s_lshr_b32 s7, s1, 24
+; GFX12-UNALIGNED-NEXT: s_lshr_b32 s9, s2, 8
+; GFX12-UNALIGNED-NEXT: s_lshr_b32 s10, s2, 16
+; GFX12-UNALIGNED-NEXT: s_lshr_b32 s11, s2, 24
+; GFX12-UNALIGNED-NEXT: s_mov_b32 s4, s1
+; GFX12-UNALIGNED-NEXT: s_mov_b32 s8, s2
+; GFX12-UNALIGNED-NEXT: s_mov_b32 s1, s13
+; GFX12-UNALIGNED-NEXT: s_mov_b32 s2, s12
+; GFX12-UNALIGNED-NEXT: ; return to shader part epilog
+;
+; GFX12-NOUNALIGNED-LABEL: s_load_constant_v12i8_align8:
+; GFX12-NOUNALIGNED: ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s13, s0, 8
+; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s12, s0, 16
+; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s3, s0, 24
+; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s5, s1, 8
+; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s6, s1, 16
+; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s7, s1, 24
+; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s9, s2, 8
+; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s10, s2, 16
+; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s11, s2, 24
+; GFX12-NOUNALIGNED-NEXT: s_mov_b32 s4, s1
+; GFX12-NOUNALIGNED-NEXT: s_mov_b32 s8, s2
+; GFX12-NOUNALIGNED-NEXT: s_mov_b32 s1, s13
+; GFX12-NOUNALIGNED-NEXT: s_mov_b32 s2, s12
+; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: s_load_constant_v12i8_align8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_mov_b32 s4, s0
+; GFX1250-NEXT: s_mov_b32 s5, s1
+; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_lshr_b32 s13, s0, 8
+; GFX1250-NEXT: s_lshr_b32 s12, s0, 16
+; GFX1250-NEXT: s_lshr_b32 s3, s0, 24
+; GFX1250-NEXT: s_lshr_b32 s5, s1, 8
+; GFX1250-NEXT: s_lshr_b32 s6, s1, 16
+; GFX1250-NEXT: s_lshr_b32 s7, s1, 24
+; GFX1250-NEXT: s_lshr_b32 s9, s2, 8
+; GFX1250-NEXT: s_lshr_b32 s10, s2, 16
+; GFX1250-NEXT: s_lshr_b32 s11, s2, 24
+; GFX1250-NEXT: s_mov_b32 s4, s1
+; GFX1250-NEXT: s_mov_b32 s8, s2
+; GFX1250-NEXT: s_mov_b32 s1, s13
+; GFX1250-NEXT: s_mov_b32 s2, s12
+; GFX1250-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_load_constant_v12i8_align8:
; GFX9: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll
index aac499f..b486fabb 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll
@@ -9,15 +9,14 @@ target triple = "amdgcn-amd-amdhsa"
define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
; GFX1250-SDAG-LABEL: use_private_to_flat_addrspacecast:
; GFX1250-SDAG: ; %bb.0:
-; GFX1250-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_lshlrev_b32 v1, 20, v0
-; GFX1250-SDAG-NEXT: s_cmp_lg_u32 s2, -1
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_lshlrev_b32 v1, 20, v0
+; GFX1250-SDAG-NEXT: s_cmp_lg_u32 s0, -1
; GFX1250-SDAG-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], src_flat_scratch_base_lo, v[0:1]
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, 0, v1
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
@@ -56,13 +55,11 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast_nonnull(ptr addrspa
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 20, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], src_flat_scratch_base_lo, v[0:1]
; GFX1250-SDAG-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: s_endpgm
@@ -91,10 +88,9 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast(ptr %ptr) {
; GFX1250-LABEL: use_flat_to_private_addrspacecast:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_sub_co_i32 s2, s0, s2
+; GFX1250-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo
; GFX1250-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1250-NEXT: s_cselect_b32 s0, s2, -1
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
@@ -110,9 +106,8 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast_nonnull(ptr %ptr) {
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: s_sub_co_i32 s0, s0, s1
+; GFX1250-SDAG-NEXT: s_sub_co_i32 s0, s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: s_endpgm
@@ -122,9 +117,7 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast_nonnull(ptr %ptr) {
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s0, s1
+; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-flat-scratch-init-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-flat-scratch-init-asan.ll
new file mode 100644
index 0000000..0d68762
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-flat-scratch-init-asan.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes='amdgpu-attributor' %s -o - | FileCheck %s
+
+@lds_1 = internal addrspace(3) global [1 x i8] poison, align 4
+
+;.
+; CHECK: @lds_1 = internal addrspace(3) global [1 x i8] poison, align 4
+;.
+define amdgpu_kernel void @k0() #0 {
+; CHECK: Function Attrs: sanitize_address
+; CHECK-LABEL: define amdgpu_kernel void @k0(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: store i8 7, ptr addrspace(3) @lds_1, align 4
+; CHECK-NEXT: ret void
+;
+ store i8 7, ptr addrspace(3) @lds_1, align 4
+ ret void
+}
+
+attributes #0 = { sanitize_address }
+; "amdgpu-no-flat-scratch-init" attribute should not be present in attribute list
+;.
+; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
index a688b6f..fb566e5 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
@@ -707,8 +707,8 @@ attributes #6 = { "enqueued-block" }
; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR15:[0-9]+]] = { nounwind "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind sanitize_address "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind sanitize_address "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR19:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR20:[0-9]+]] = { "enqueued-block" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "enqueued-block" "uniform-work-group-size"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll
index ef52694..54871a6 100644
--- a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll
@@ -538,58 +538,61 @@ define double @flat_system_atomic_fadd_f64(ptr %ptr, double %val) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX1250-NEXT: s_mov_b64 s[0:1], src_shared_base
; GFX1250-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1250-NEXT: s_cbranch_execz .LBB34_6
-; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.check.private
-; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_hi
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_xor_b32_e32 v4, s1, v1
-; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
-; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-NEXT: s_cbranch_execnz .LBB34_3
+; GFX1250-NEXT: ; %bb.1: ; %Flow2
+; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB34_8
+; GFX1250-NEXT: .LBB34_2: ; %atomicrmw.phi
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+; GFX1250-NEXT: .LBB34_3: ; %atomicrmw.check.private
+; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
+; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-NEXT: s_and_saveexec_b32 s1, vcc_lo
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_xor_b32 s1, exec_lo, s1
-; GFX1250-NEXT: s_cbranch_execz .LBB34_3
-; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.global
-; GFX1250-NEXT: global_atomic_add_f64 v[4:5], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT: s_cbranch_execz .LBB34_5
+; GFX1250-NEXT: ; %bb.4: ; %atomicrmw.global
+; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[4:5], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-NEXT: .LBB34_3: ; %Flow
+; GFX1250-NEXT: .LBB34_5: ; %Flow
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_and_not1_saveexec_b32 s1, s1
-; GFX1250-NEXT: s_cbranch_execz .LBB34_5
-; GFX1250-NEXT: ; %bb.4: ; %atomicrmw.private
-; GFX1250-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo
-; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX1250-NEXT: s_cbranch_execz .LBB34_7
+; GFX1250-NEXT: ; %bb.6: ; %atomicrmw.private
+; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s2, v0
+; GFX1250-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
-; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX1250-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[4:5], v[2:3]
-; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off
-; GFX1250-NEXT: .LBB34_5: ; %Flow1
+; GFX1250-NEXT: v_add_f64_e32 v[2:3], v[0:1], v[2:3]
+; GFX1250-NEXT: scratch_store_b64 v4, v[2:3], off
+; GFX1250-NEXT: .LBB34_7: ; %Flow1
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-NEXT: .LBB34_6: ; %Flow2
; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX1250-NEXT: s_cbranch_execz .LBB34_8
-; GFX1250-NEXT: ; %bb.7: ; %atomicrmw.shared
-; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
+; GFX1250-NEXT: s_cbranch_execz .LBB34_2
+; GFX1250-NEXT: .LBB34_8: ; %atomicrmw.shared
+; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: ds_add_rtn_f64 v[4:5], v0, v[2:3]
-; GFX1250-NEXT: .LBB34_8: ; %atomicrmw.phi
+; GFX1250-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc_lo
+; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = atomicrmw fadd ptr %ptr, double %val monotonic
ret double %result
@@ -600,58 +603,61 @@ define double @flat_one_as_atomic_fadd_f64(ptr %ptr, double %val) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX1250-NEXT: s_mov_b64 s[0:1], src_shared_base
; GFX1250-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1250-NEXT: s_cbranch_execz .LBB35_6
-; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.check.private
-; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_hi
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_xor_b32_e32 v4, s1, v1
-; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
-; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-NEXT: s_cbranch_execnz .LBB35_3
+; GFX1250-NEXT: ; %bb.1: ; %Flow2
+; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB35_8
+; GFX1250-NEXT: .LBB35_2: ; %atomicrmw.phi
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+; GFX1250-NEXT: .LBB35_3: ; %atomicrmw.check.private
+; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
+; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-NEXT: s_and_saveexec_b32 s1, vcc_lo
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_xor_b32 s1, exec_lo, s1
-; GFX1250-NEXT: s_cbranch_execz .LBB35_3
-; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.global
-; GFX1250-NEXT: global_atomic_add_f64 v[4:5], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT: s_cbranch_execz .LBB35_5
+; GFX1250-NEXT: ; %bb.4: ; %atomicrmw.global
+; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[4:5], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-NEXT: .LBB35_3: ; %Flow
+; GFX1250-NEXT: .LBB35_5: ; %Flow
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_and_not1_saveexec_b32 s1, s1
-; GFX1250-NEXT: s_cbranch_execz .LBB35_5
-; GFX1250-NEXT: ; %bb.4: ; %atomicrmw.private
-; GFX1250-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo
-; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX1250-NEXT: s_cbranch_execz .LBB35_7
+; GFX1250-NEXT: ; %bb.6: ; %atomicrmw.private
+; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s2, v0
+; GFX1250-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
-; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX1250-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[4:5], v[2:3]
-; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off
-; GFX1250-NEXT: .LBB35_5: ; %Flow1
+; GFX1250-NEXT: v_add_f64_e32 v[2:3], v[0:1], v[2:3]
+; GFX1250-NEXT: scratch_store_b64 v4, v[2:3], off
+; GFX1250-NEXT: .LBB35_7: ; %Flow1
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-NEXT: .LBB35_6: ; %Flow2
; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX1250-NEXT: s_cbranch_execz .LBB35_8
-; GFX1250-NEXT: ; %bb.7: ; %atomicrmw.shared
-; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
+; GFX1250-NEXT: s_cbranch_execz .LBB35_2
+; GFX1250-NEXT: .LBB35_8: ; %atomicrmw.shared
+; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: ds_add_rtn_f64 v[4:5], v0, v[2:3]
-; GFX1250-NEXT: .LBB35_8: ; %atomicrmw.phi
+; GFX1250-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc_lo
+; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = atomicrmw fadd ptr %ptr, double %val syncscope("one-as") monotonic
ret double %result
@@ -686,40 +692,42 @@ define double @flat_system_atomic_fmin_f64(ptr %ptr, double %val) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1
-; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
-; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
+; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
+; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1250-NEXT: s_cbranch_execz .LBB38_2
-; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX1250-NEXT: flat_atomic_min_num_f64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT: s_cbranch_execnz .LBB38_3
+; GFX1250-NEXT: ; %bb.1: ; %Flow
+; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB38_4
+; GFX1250-NEXT: .LBB38_2: ; %atomicrmw.phi
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+; GFX1250-NEXT: .LBB38_3: ; %atomicrmw.global
+; GFX1250-NEXT: flat_atomic_min_num_f64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-NEXT: .LBB38_2: ; %Flow
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX1250-NEXT: s_cbranch_execz .LBB38_4
-; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
-; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX1250-NEXT: s_cbranch_execz .LBB38_2
+; GFX1250-NEXT: .LBB38_4: ; %atomicrmw.private
+; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v4, vcc_lo
-; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v0, vcc_lo
+; GFX1250-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
-; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
-; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off
-; GFX1250-NEXT: .LBB38_4: ; %atomicrmw.phi
+; GFX1250-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
+; GFX1250-NEXT: v_min_num_f64_e32 v[2:3], v[4:5], v[2:3]
+; GFX1250-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = atomicrmw fmin ptr %ptr, double %val monotonic
ret double %result
@@ -730,40 +738,42 @@ define double @flat_one_as_atomic_fmin_f64(ptr %ptr, double %val) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1
-; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
-; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
+; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
+; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1250-NEXT: s_cbranch_execz .LBB39_2
-; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX1250-NEXT: flat_atomic_min_num_f64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT: s_cbranch_execnz .LBB39_3
+; GFX1250-NEXT: ; %bb.1: ; %Flow
+; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB39_4
+; GFX1250-NEXT: .LBB39_2: ; %atomicrmw.phi
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+; GFX1250-NEXT: .LBB39_3: ; %atomicrmw.global
+; GFX1250-NEXT: flat_atomic_min_num_f64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-NEXT: .LBB39_2: ; %Flow
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX1250-NEXT: s_cbranch_execz .LBB39_4
-; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
-; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX1250-NEXT: s_cbranch_execz .LBB39_2
+; GFX1250-NEXT: .LBB39_4: ; %atomicrmw.private
+; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v4, vcc_lo
-; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v0, vcc_lo
+; GFX1250-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
-; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
-; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off
-; GFX1250-NEXT: .LBB39_4: ; %atomicrmw.phi
+; GFX1250-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
+; GFX1250-NEXT: v_min_num_f64_e32 v[2:3], v[4:5], v[2:3]
+; GFX1250-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = atomicrmw fmin ptr %ptr, double %val syncscope("one-as") monotonic
ret double %result
@@ -798,40 +808,42 @@ define double @flat_system_atomic_fmax_f64(ptr %ptr, double %val) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1
-; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
-; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
+; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
+; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1250-NEXT: s_cbranch_execz .LBB42_2
-; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX1250-NEXT: flat_atomic_max_num_f64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT: s_cbranch_execnz .LBB42_3
+; GFX1250-NEXT: ; %bb.1: ; %Flow
+; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB42_4
+; GFX1250-NEXT: .LBB42_2: ; %atomicrmw.phi
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+; GFX1250-NEXT: .LBB42_3: ; %atomicrmw.global
+; GFX1250-NEXT: flat_atomic_max_num_f64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-NEXT: .LBB42_2: ; %Flow
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX1250-NEXT: s_cbranch_execz .LBB42_4
-; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
-; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX1250-NEXT: s_cbranch_execz .LBB42_2
+; GFX1250-NEXT: .LBB42_4: ; %atomicrmw.private
+; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v4, vcc_lo
-; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v0, vcc_lo
+; GFX1250-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
-; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
-; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off
-; GFX1250-NEXT: .LBB42_4: ; %atomicrmw.phi
+; GFX1250-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
+; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[2:3]
+; GFX1250-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = atomicrmw fmax ptr %ptr, double %val monotonic
ret double %result
@@ -842,40 +854,42 @@ define double @flat_one_as_atomic_fmax_f64(ptr %ptr, double %val) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1
-; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
-; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
+; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
+; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1250-NEXT: s_cbranch_execz .LBB43_2
-; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX1250-NEXT: flat_atomic_max_num_f64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT: s_cbranch_execnz .LBB43_3
+; GFX1250-NEXT: ; %bb.1: ; %Flow
+; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB43_4
+; GFX1250-NEXT: .LBB43_2: ; %atomicrmw.phi
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+; GFX1250-NEXT: .LBB43_3: ; %atomicrmw.global
+; GFX1250-NEXT: flat_atomic_max_num_f64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-NEXT: .LBB43_2: ; %Flow
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX1250-NEXT: s_cbranch_execz .LBB43_4
-; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
-; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX1250-NEXT: s_cbranch_execz .LBB43_2
+; GFX1250-NEXT: .LBB43_4: ; %atomicrmw.private
+; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v4, vcc_lo
-; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v0, vcc_lo
+; GFX1250-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
-; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
-; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off
-; GFX1250-NEXT: .LBB43_4: ; %atomicrmw.phi
+; GFX1250-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
+; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[2:3]
+; GFX1250-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = atomicrmw fmax ptr %ptr, double %val syncscope("one-as") monotonic
ret double %result
@@ -982,13 +996,11 @@ define i64 @flat_one_as_atomic_min_i64(ptr %ptr, i64 %val) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-NEXT: s_cbranch_execz .LBB52_2
; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global
@@ -1000,10 +1012,9 @@ define i64 @flat_one_as_atomic_min_i64(ptr %ptr, i64 %val) {
; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-NEXT: s_cbranch_execz .LBB52_4
; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off
@@ -1025,13 +1036,11 @@ define i64 @flat_system_atomic_min_i64(ptr %ptr, i64 %val) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-NEXT: s_cbranch_execz .LBB53_2
; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global
@@ -1043,10 +1052,9 @@ define i64 @flat_system_atomic_min_i64(ptr %ptr, i64 %val) {
; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-NEXT: s_cbranch_execz .LBB53_4
; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off
@@ -1068,13 +1076,11 @@ define i64 @flat_one_as_atomic_max_i64(ptr %ptr, i64 %val) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-NEXT: s_cbranch_execz .LBB54_2
; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global
@@ -1086,10 +1092,9 @@ define i64 @flat_one_as_atomic_max_i64(ptr %ptr, i64 %val) {
; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-NEXT: s_cbranch_execz .LBB54_4
; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off
@@ -1111,13 +1116,11 @@ define i64 @flat_system_atomic_max_i64(ptr %ptr, i64 %val) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-NEXT: s_cbranch_execz .LBB55_2
; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global
@@ -1129,10 +1132,9 @@ define i64 @flat_system_atomic_max_i64(ptr %ptr, i64 %val) {
; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-NEXT: s_cbranch_execz .LBB55_4
; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off
@@ -1154,13 +1156,11 @@ define i64 @flat_one_as_atomic_umin_i64(ptr %ptr, i64 %val) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-NEXT: s_cbranch_execz .LBB56_2
; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global
@@ -1172,10 +1172,9 @@ define i64 @flat_one_as_atomic_umin_i64(ptr %ptr, i64 %val) {
; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-NEXT: s_cbranch_execz .LBB56_4
; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off
@@ -1197,13 +1196,11 @@ define i64 @flat_system_atomic_umin_i64(ptr %ptr, i64 %val) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-NEXT: s_cbranch_execz .LBB57_2
; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global
@@ -1215,10 +1212,9 @@ define i64 @flat_system_atomic_umin_i64(ptr %ptr, i64 %val) {
; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-NEXT: s_cbranch_execz .LBB57_4
; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off
@@ -1240,13 +1236,11 @@ define i64 @flat_one_as_atomic_umax_i64(ptr %ptr, i64 %val) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-NEXT: s_cbranch_execz .LBB58_2
; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global
@@ -1258,10 +1252,9 @@ define i64 @flat_one_as_atomic_umax_i64(ptr %ptr, i64 %val) {
; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-NEXT: s_cbranch_execz .LBB58_4
; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off
@@ -1283,13 +1276,11 @@ define i64 @flat_system_atomic_umax_i64(ptr %ptr, i64 %val) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-NEXT: s_cbranch_execz .LBB59_2
; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global
@@ -1301,10 +1292,9 @@ define i64 @flat_system_atomic_umax_i64(ptr %ptr, i64 %val) {
; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-NEXT: s_cbranch_execz .LBB59_4
; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 94ba5cd..6b5647e 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -569,10 +569,10 @@ define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX1250-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
; GFX1250-NEXT: s_clause 0x1
-; GFX1250-NEXT: global_load_b128 v[0:3], v[4:5], off
-; GFX1250-NEXT: global_load_b128 v[4:7], v[4:5], off offset:16
+; GFX1250-NEXT: global_load_b128 v[0:3], v[8:9], off
+; GFX1250-NEXT: global_load_b128 v[4:7], v[8:9], off offset:16
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%load = load <16 x bfloat>, ptr addrspace(1) %ptr
@@ -752,12 +752,12 @@ define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v12, v0
+; GFX1250-NEXT: v_dual_mov_b32 v17, v1 :: v_dual_mov_b32 v16, v0
; GFX1250-NEXT: s_clause 0x3
-; GFX1250-NEXT: global_load_b128 v[0:3], v[12:13], off
-; GFX1250-NEXT: global_load_b128 v[4:7], v[12:13], off offset:16
-; GFX1250-NEXT: global_load_b128 v[8:11], v[12:13], off offset:32
-; GFX1250-NEXT: global_load_b128 v[12:15], v[12:13], off offset:48
+; GFX1250-NEXT: global_load_b128 v[0:3], v[16:17], off
+; GFX1250-NEXT: global_load_b128 v[4:7], v[16:17], off offset:16
+; GFX1250-NEXT: global_load_b128 v[8:11], v[16:17], off offset:32
+; GFX1250-NEXT: global_load_b128 v[12:15], v[16:17], off offset:48
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%load = load <32 x bfloat>, ptr addrspace(1) %ptr
@@ -1055,16 +1055,16 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_mov_b32 v28, v0
+; GFX1250-NEXT: v_dual_mov_b32 v33, v1 :: v_dual_mov_b32 v32, v0
; GFX1250-NEXT: s_clause 0x7
-; GFX1250-NEXT: global_load_b128 v[0:3], v[28:29], off
-; GFX1250-NEXT: global_load_b128 v[4:7], v[28:29], off offset:16
-; GFX1250-NEXT: global_load_b128 v[8:11], v[28:29], off offset:32
-; GFX1250-NEXT: global_load_b128 v[12:15], v[28:29], off offset:48
-; GFX1250-NEXT: global_load_b128 v[16:19], v[28:29], off offset:64
-; GFX1250-NEXT: global_load_b128 v[20:23], v[28:29], off offset:80
-; GFX1250-NEXT: global_load_b128 v[24:27], v[28:29], off offset:96
-; GFX1250-NEXT: global_load_b128 v[28:31], v[28:29], off offset:112
+; GFX1250-NEXT: global_load_b128 v[0:3], v[32:33], off
+; GFX1250-NEXT: global_load_b128 v[4:7], v[32:33], off offset:16
+; GFX1250-NEXT: global_load_b128 v[8:11], v[32:33], off offset:32
+; GFX1250-NEXT: global_load_b128 v[12:15], v[32:33], off offset:48
+; GFX1250-NEXT: global_load_b128 v[16:19], v[32:33], off offset:64
+; GFX1250-NEXT: global_load_b128 v[20:23], v[32:33], off offset:80
+; GFX1250-NEXT: global_load_b128 v[24:27], v[32:33], off offset:96
+; GFX1250-NEXT: global_load_b128 v[28:31], v[32:33], off offset:112
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%load = load <64 x bfloat>, ptr addrspace(1) %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
index 243f0ed..f8655a7 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
@@ -256,7 +256,6 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: .LBB5_3: ; %bb4
; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GCN-NEXT: s_wait_xcnt 0x0
; GCN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 63
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index ddd3b152..363a248 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -2700,142 +2700,142 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
;
; GFX1250-LABEL: amd_kernel_v32i8:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b64_e32 v[8:9], 16
; GFX1250-NEXT: v_mov_b64_e32 v[10:11], 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_lshr_b32 s16, s0, 16
-; GFX1250-NEXT: s_lshr_b32 s17, s0, 24
-; GFX1250-NEXT: s_lshr_b32 s20, s2, 16
-; GFX1250-NEXT: s_lshr_b32 s21, s2, 24
-; GFX1250-NEXT: s_lshr_b32 s14, s7, 16
-; GFX1250-NEXT: s_lshr_b32 s15, s7, 24
-; GFX1250-NEXT: s_bfe_u32 s27, s7, 0x80008
+; GFX1250-NEXT: s_lshr_b32 s16, s8, 16
+; GFX1250-NEXT: s_lshr_b32 s17, s8, 24
+; GFX1250-NEXT: s_lshr_b32 s6, s15, 16
+; GFX1250-NEXT: s_lshr_b32 s7, s15, 24
+; GFX1250-NEXT: s_bfe_u32 s27, s15, 0x80008
; GFX1250-NEXT: s_add_co_i32 s17, s17, s17
; GFX1250-NEXT: s_add_co_i32 s16, s16, s16
-; GFX1250-NEXT: s_lshr_b32 s18, s1, 16
-; GFX1250-NEXT: s_lshr_b32 s19, s1, 24
-; GFX1250-NEXT: s_lshr_b32 s22, s3, 16
-; GFX1250-NEXT: s_lshr_b32 s23, s3, 24
-; GFX1250-NEXT: s_bfe_u32 s29, s1, 0x80008
-; GFX1250-NEXT: s_bfe_u32 s30, s3, 0x80008
-; GFX1250-NEXT: s_add_co_i32 s21, s21, s21
-; GFX1250-NEXT: s_add_co_i32 s20, s20, s20
; GFX1250-NEXT: s_lshl_b32 s17, s17, 8
; GFX1250-NEXT: s_and_b32 s16, s16, 0xff
-; GFX1250-NEXT: s_add_co_i32 s7, s7, s7
-; GFX1250-NEXT: s_add_co_i32 s27, s27, s27
; GFX1250-NEXT: s_add_co_i32 s15, s15, s15
-; GFX1250-NEXT: s_add_co_i32 s14, s14, s14
-; GFX1250-NEXT: s_add_co_i32 s3, s3, s3
+; GFX1250-NEXT: s_add_co_i32 s27, s27, s27
+; GFX1250-NEXT: s_add_co_i32 s7, s7, s7
+; GFX1250-NEXT: s_add_co_i32 s6, s6, s6
+; GFX1250-NEXT: s_or_b32 s16, s16, s17
+; GFX1250-NEXT: s_and_b32 s15, s15, 0xff
+; GFX1250-NEXT: s_lshl_b32 s17, s27, 8
+; GFX1250-NEXT: s_lshl_b32 s7, s7, 8
+; GFX1250-NEXT: s_and_b32 s6, s6, 0xff
+; GFX1250-NEXT: s_or_b32 s15, s15, s17
+; GFX1250-NEXT: s_or_b32 s6, s6, s7
+; GFX1250-NEXT: s_bfe_u32 s26, s14, 0x80008
+; GFX1250-NEXT: s_and_b32 s7, s15, 0xffff
+; GFX1250-NEXT: s_lshl_b32 s6, s6, 16
+; GFX1250-NEXT: s_lshr_b32 s20, s10, 16
+; GFX1250-NEXT: s_lshr_b32 s21, s10, 24
+; GFX1250-NEXT: s_lshr_b32 s4, s14, 16
+; GFX1250-NEXT: s_lshr_b32 s5, s14, 24
+; GFX1250-NEXT: s_or_b32 s6, s7, s6
+; GFX1250-NEXT: s_add_co_i32 s7, s14, s14
+; GFX1250-NEXT: s_add_co_i32 s26, s26, s26
+; GFX1250-NEXT: s_lshr_b32 s18, s9, 16
+; GFX1250-NEXT: s_lshr_b32 s19, s9, 24
+; GFX1250-NEXT: s_lshr_b32 s22, s11, 16
+; GFX1250-NEXT: s_lshr_b32 s23, s11, 24
+; GFX1250-NEXT: s_bfe_u32 s29, s9, 0x80008
+; GFX1250-NEXT: s_bfe_u32 s30, s11, 0x80008
+; GFX1250-NEXT: s_add_co_i32 s21, s21, s21
+; GFX1250-NEXT: s_add_co_i32 s20, s20, s20
+; GFX1250-NEXT: s_lshr_b32 s2, s13, 16
+; GFX1250-NEXT: s_lshr_b32 s3, s13, 24
+; GFX1250-NEXT: s_and_b32 s7, s7, 0xff
+; GFX1250-NEXT: s_lshl_b32 s14, s26, 8
+; GFX1250-NEXT: s_add_co_i32 s5, s5, s5
+; GFX1250-NEXT: s_add_co_i32 s4, s4, s4
+; GFX1250-NEXT: s_add_co_i32 s11, s11, s11
; GFX1250-NEXT: s_add_co_i32 s30, s30, s30
; GFX1250-NEXT: s_add_co_i32 s23, s23, s23
; GFX1250-NEXT: s_add_co_i32 s22, s22, s22
; GFX1250-NEXT: s_lshl_b32 s21, s21, 8
; GFX1250-NEXT: s_and_b32 s20, s20, 0xff
-; GFX1250-NEXT: s_add_co_i32 s1, s1, s1
+; GFX1250-NEXT: s_add_co_i32 s9, s9, s9
; GFX1250-NEXT: s_add_co_i32 s29, s29, s29
; GFX1250-NEXT: s_add_co_i32 s19, s19, s19
; GFX1250-NEXT: s_add_co_i32 s18, s18, s18
-; GFX1250-NEXT: s_lshr_b32 s10, s5, 16
-; GFX1250-NEXT: s_lshr_b32 s11, s5, 24
-; GFX1250-NEXT: s_lshr_b32 s12, s6, 16
-; GFX1250-NEXT: s_lshr_b32 s13, s6, 24
-; GFX1250-NEXT: s_or_b32 s16, s16, s17
-; GFX1250-NEXT: s_and_b32 s7, s7, 0xff
-; GFX1250-NEXT: s_lshl_b32 s17, s27, 8
-; GFX1250-NEXT: s_lshl_b32 s15, s15, 8
-; GFX1250-NEXT: s_and_b32 s14, s14, 0xff
-; GFX1250-NEXT: s_and_b32 s3, s3, 0xff
+; GFX1250-NEXT: s_bfe_u32 s25, s13, 0x80008
+; GFX1250-NEXT: s_lshl_b32 s5, s5, 8
+; GFX1250-NEXT: s_and_b32 s4, s4, 0xff
+; GFX1250-NEXT: s_or_b32 s7, s7, s14
+; GFX1250-NEXT: s_add_co_i32 s3, s3, s3
+; GFX1250-NEXT: s_add_co_i32 s2, s2, s2
+; GFX1250-NEXT: s_and_b32 s11, s11, 0xff
; GFX1250-NEXT: s_lshl_b32 s30, s30, 8
; GFX1250-NEXT: s_lshl_b32 s23, s23, 8
; GFX1250-NEXT: s_and_b32 s22, s22, 0xff
; GFX1250-NEXT: s_or_b32 s20, s20, s21
-; GFX1250-NEXT: s_and_b32 s1, s1, 0xff
+; GFX1250-NEXT: s_and_b32 s9, s9, 0xff
; GFX1250-NEXT: s_lshl_b32 s21, s29, 8
; GFX1250-NEXT: s_lshl_b32 s19, s19, 8
; GFX1250-NEXT: s_and_b32 s18, s18, 0xff
-; GFX1250-NEXT: s_lshr_b32 s8, s4, 16
-; GFX1250-NEXT: s_lshr_b32 s9, s4, 24
-; GFX1250-NEXT: s_bfe_u32 s24, s4, 0x80008
-; GFX1250-NEXT: s_bfe_u32 s25, s5, 0x80008
-; GFX1250-NEXT: s_bfe_u32 s26, s6, 0x80008
-; GFX1250-NEXT: s_or_b32 s7, s7, s17
-; GFX1250-NEXT: s_or_b32 s14, s14, s15
-; GFX1250-NEXT: s_add_co_i32 s13, s13, s13
-; GFX1250-NEXT: s_add_co_i32 s12, s12, s12
-; GFX1250-NEXT: s_add_co_i32 s11, s11, s11
-; GFX1250-NEXT: s_add_co_i32 s10, s10, s10
-; GFX1250-NEXT: s_bfe_u32 s28, s0, 0x80008
-; GFX1250-NEXT: s_or_b32 s3, s3, s30
+; GFX1250-NEXT: s_lshr_b32 s0, s12, 16
+; GFX1250-NEXT: s_lshr_b32 s1, s12, 24
+; GFX1250-NEXT: s_bfe_u32 s24, s12, 0x80008
+; GFX1250-NEXT: s_or_b32 s4, s4, s5
+; GFX1250-NEXT: s_and_b32 s5, s7, 0xffff
+; GFX1250-NEXT: s_add_co_i32 s7, s13, s13
+; GFX1250-NEXT: s_add_co_i32 s25, s25, s25
+; GFX1250-NEXT: s_lshl_b32 s3, s3, 8
+; GFX1250-NEXT: s_and_b32 s2, s2, 0xff
+; GFX1250-NEXT: s_bfe_u32 s28, s8, 0x80008
+; GFX1250-NEXT: s_or_b32 s11, s11, s30
; GFX1250-NEXT: s_or_b32 s22, s22, s23
-; GFX1250-NEXT: s_bfe_u32 s23, s2, 0x80008
-; GFX1250-NEXT: s_or_b32 s1, s1, s21
+; GFX1250-NEXT: s_bfe_u32 s23, s10, 0x80008
+; GFX1250-NEXT: s_or_b32 s9, s9, s21
; GFX1250-NEXT: s_or_b32 s18, s18, s19
-; GFX1250-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX1250-NEXT: s_lshl_b32 s14, s14, 16
-; GFX1250-NEXT: s_add_co_i32 s6, s6, s6
-; GFX1250-NEXT: s_add_co_i32 s26, s26, s26
-; GFX1250-NEXT: s_lshl_b32 s13, s13, 8
-; GFX1250-NEXT: s_and_b32 s12, s12, 0xff
-; GFX1250-NEXT: s_add_co_i32 s5, s5, s5
-; GFX1250-NEXT: s_add_co_i32 s25, s25, s25
-; GFX1250-NEXT: s_lshl_b32 s11, s11, 8
-; GFX1250-NEXT: s_and_b32 s10, s10, 0xff
-; GFX1250-NEXT: s_add_co_i32 s4, s4, s4
+; GFX1250-NEXT: s_lshl_b32 s4, s4, 16
+; GFX1250-NEXT: s_and_b32 s7, s7, 0xff
+; GFX1250-NEXT: s_lshl_b32 s13, s25, 8
+; GFX1250-NEXT: s_or_b32 s2, s2, s3
+; GFX1250-NEXT: s_add_co_i32 s3, s12, s12
; GFX1250-NEXT: s_add_co_i32 s24, s24, s24
-; GFX1250-NEXT: s_add_co_i32 s9, s9, s9
-; GFX1250-NEXT: s_add_co_i32 s8, s8, s8
-; GFX1250-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX1250-NEXT: s_add_co_i32 s1, s1, s1
+; GFX1250-NEXT: s_add_co_i32 s0, s0, s0
+; GFX1250-NEXT: s_and_b32 s11, s11, 0xffff
; GFX1250-NEXT: s_lshl_b32 s22, s22, 16
-; GFX1250-NEXT: s_add_co_i32 s2, s2, s2
+; GFX1250-NEXT: s_add_co_i32 s10, s10, s10
; GFX1250-NEXT: s_add_co_i32 s23, s23, s23
-; GFX1250-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX1250-NEXT: s_and_b32 s9, s9, 0xffff
; GFX1250-NEXT: s_lshl_b32 s18, s18, 16
-; GFX1250-NEXT: s_add_co_i32 s0, s0, s0
+; GFX1250-NEXT: s_add_co_i32 s8, s8, s8
; GFX1250-NEXT: s_add_co_i32 s28, s28, s28
-; GFX1250-NEXT: s_or_b32 s7, s7, s14
-; GFX1250-NEXT: s_and_b32 s6, s6, 0xff
-; GFX1250-NEXT: s_lshl_b32 s14, s26, 8
-; GFX1250-NEXT: s_or_b32 s12, s12, s13
-; GFX1250-NEXT: s_and_b32 s5, s5, 0xff
-; GFX1250-NEXT: s_lshl_b32 s13, s25, 8
-; GFX1250-NEXT: s_or_b32 s10, s10, s11
-; GFX1250-NEXT: s_and_b32 s4, s4, 0xff
-; GFX1250-NEXT: s_lshl_b32 s11, s24, 8
-; GFX1250-NEXT: s_lshl_b32 s9, s9, 8
-; GFX1250-NEXT: s_and_b32 s8, s8, 0xff
-; GFX1250-NEXT: s_or_b32 s3, s3, s22
-; GFX1250-NEXT: s_and_b32 s2, s2, 0xff
-; GFX1250-NEXT: s_lshl_b32 s22, s23, 8
-; GFX1250-NEXT: s_or_b32 s1, s1, s18
+; GFX1250-NEXT: s_or_b32 s4, s5, s4
+; GFX1250-NEXT: s_or_b32 s5, s7, s13
+; GFX1250-NEXT: s_and_b32 s3, s3, 0xff
+; GFX1250-NEXT: s_lshl_b32 s7, s24, 8
+; GFX1250-NEXT: s_lshl_b32 s1, s1, 8
; GFX1250-NEXT: s_and_b32 s0, s0, 0xff
+; GFX1250-NEXT: s_or_b32 s11, s11, s22
+; GFX1250-NEXT: s_and_b32 s10, s10, 0xff
+; GFX1250-NEXT: s_lshl_b32 s22, s23, 8
+; GFX1250-NEXT: s_or_b32 s9, s9, s18
+; GFX1250-NEXT: s_and_b32 s8, s8, 0xff
; GFX1250-NEXT: s_lshl_b32 s18, s28, 8
-; GFX1250-NEXT: s_or_b32 s6, s6, s14
-; GFX1250-NEXT: s_or_b32 s5, s5, s13
-; GFX1250-NEXT: s_or_b32 s4, s4, s11
-; GFX1250-NEXT: s_or_b32 s8, s8, s9
-; GFX1250-NEXT: s_or_b32 s2, s2, s22
-; GFX1250-NEXT: s_or_b32 s0, s0, s18
-; GFX1250-NEXT: s_and_b32 s6, s6, 0xffff
-; GFX1250-NEXT: s_lshl_b32 s12, s12, 16
+; GFX1250-NEXT: s_or_b32 s3, s3, s7
+; GFX1250-NEXT: s_or_b32 s0, s0, s1
+; GFX1250-NEXT: s_or_b32 s10, s10, s22
+; GFX1250-NEXT: s_or_b32 s8, s8, s18
; GFX1250-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX1250-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX1250-NEXT: s_lshl_b32 s8, s8, 16
-; GFX1250-NEXT: s_lshl_b32 s9, s10, 16
-; GFX1250-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX1250-NEXT: s_and_b32 s1, s3, 0xffff
+; GFX1250-NEXT: s_lshl_b32 s0, s0, 16
+; GFX1250-NEXT: s_lshl_b32 s2, s2, 16
+; GFX1250-NEXT: s_and_b32 s10, s10, 0xffff
; GFX1250-NEXT: s_lshl_b32 s20, s20, 16
-; GFX1250-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX1250-NEXT: s_and_b32 s8, s8, 0xffff
; GFX1250-NEXT: s_lshl_b32 s16, s16, 16
-; GFX1250-NEXT: s_or_b32 s6, s6, s12
-; GFX1250-NEXT: s_or_b32 s4, s4, s8
-; GFX1250-NEXT: s_or_b32 s5, s5, s9
-; GFX1250-NEXT: s_or_b32 s2, s2, s20
-; GFX1250-NEXT: s_or_b32 s0, s0, s16
-; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX1250-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
-; GFX1250-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
-; GFX1250-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
+; GFX1250-NEXT: s_or_b32 s0, s1, s0
+; GFX1250-NEXT: s_or_b32 s1, s5, s2
+; GFX1250-NEXT: s_or_b32 s10, s10, s20
+; GFX1250-NEXT: s_or_b32 s8, s8, s16
+; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1250-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s6
+; GFX1250-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9
+; GFX1250-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[8:9], v[0:3], off
; GFX1250-NEXT: global_store_b128 v[10:11], v[4:7], off
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index 51652a0..2ae6fc2 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -117,12 +117,12 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
;
; GFX1250-LABEL: sadd64rr:
; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[6:7]
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -818,17 +818,17 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
;
; GFX1250-LABEL: suaddo64:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_add_nc_u64 s[6:7], s[4:5], s[6:7]
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
-; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[12:13], s[14:15]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[12:13]
+; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX1250-NEXT: s_clause 0x1
-; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX1250-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[8:9]
+; GFX1250-NEXT: global_store_b8 v2, v3, s[10:11]
; GFX1250-NEXT: s_endpgm
%uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %uadd, 0
@@ -1096,12 +1096,12 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
;
; GFX1250-LABEL: ssub64rr:
; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_sub_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX1250-NEXT: s_sub_nc_u64 s[2:3], s[2:3], s[6:7]
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -1798,17 +1798,17 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
;
; GFX1250-LABEL: susubo64:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_sub_nc_u64 s[6:7], s[4:5], s[6:7]
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
-; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX1250-NEXT: s_sub_nc_u64 s[0:1], s[12:13], s[14:15]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], s[12:13]
+; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX1250-NEXT: s_clause 0x1
-; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX1250-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[8:9]
+; GFX1250-NEXT: global_store_b8 v2, v3, s[10:11]
; GFX1250-NEXT: s_endpgm
%usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %usub, 0
@@ -3099,70 +3099,70 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
;
; GFX1250-LABEL: sudiv64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5]
+; GFX1250-NEXT: s_or_b64 s[4:5], s[2:3], s[6:7]
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250-NEXT: s_and_b64 s[6:7], s[6:7], 0xffffffff00000000
-; GFX1250-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX1250-NEXT: s_and_b64 s[4:5], s[4:5], 0xffffffff00000000
+; GFX1250-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX1250-NEXT: s_cbranch_scc0 .LBB16_4
; GFX1250-NEXT: ; %bb.1:
-; GFX1250-NEXT: s_cvt_f32_u32 s6, s4
-; GFX1250-NEXT: s_cvt_f32_u32 s7, s5
-; GFX1250-NEXT: s_sub_nc_u64 s[10:11], 0, s[4:5]
+; GFX1250-NEXT: s_cvt_f32_u32 s4, s6
+; GFX1250-NEXT: s_cvt_f32_u32 s5, s7
+; GFX1250-NEXT: s_sub_nc_u64 s[10:11], 0, s[6:7]
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_3)
-; GFX1250-NEXT: s_fmac_f32 s6, s7, 0x4f800000
-; GFX1250-NEXT: v_s_rcp_f32 s6, s6
+; GFX1250-NEXT: s_fmac_f32 s4, s5, 0x4f800000
+; GFX1250-NEXT: v_s_rcp_f32 s4, s4
; GFX1250-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
-; GFX1250-NEXT: s_mul_f32 s6, s6, 0x5f7ffffc
-; GFX1250-NEXT: s_mul_f32 s7, s6, 0x2f800000
+; GFX1250-NEXT: s_mul_f32 s4, s4, 0x5f7ffffc
+; GFX1250-NEXT: s_mul_f32 s5, s4, 0x2f800000
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3)
-; GFX1250-NEXT: s_trunc_f32 s7, s7
-; GFX1250-NEXT: s_fmac_f32 s6, s7, 0xcf800000
-; GFX1250-NEXT: s_cvt_u32_f32 s9, s7
-; GFX1250-NEXT: s_mov_b32 s7, 0
+; GFX1250-NEXT: s_trunc_f32 s5, s5
+; GFX1250-NEXT: s_fmac_f32 s4, s5, 0xcf800000
+; GFX1250-NEXT: s_cvt_u32_f32 s9, s5
+; GFX1250-NEXT: s_mov_b32 s5, 0
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
-; GFX1250-NEXT: s_cvt_u32_f32 s8, s6
+; GFX1250-NEXT: s_cvt_u32_f32 s8, s4
; GFX1250-NEXT: s_mul_u64 s[12:13], s[10:11], s[8:9]
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_mul_hi_u32 s15, s8, s13
; GFX1250-NEXT: s_mul_i32 s14, s8, s13
-; GFX1250-NEXT: s_mul_hi_u32 s6, s8, s12
+; GFX1250-NEXT: s_mul_hi_u32 s4, s8, s12
; GFX1250-NEXT: s_mul_i32 s17, s9, s12
-; GFX1250-NEXT: s_add_nc_u64 s[14:15], s[6:7], s[14:15]
+; GFX1250-NEXT: s_add_nc_u64 s[14:15], s[4:5], s[14:15]
; GFX1250-NEXT: s_mul_hi_u32 s16, s9, s12
; GFX1250-NEXT: s_mul_hi_u32 s18, s9, s13
-; GFX1250-NEXT: s_add_co_u32 s6, s14, s17
-; GFX1250-NEXT: s_add_co_ci_u32 s6, s15, s16
+; GFX1250-NEXT: s_add_co_u32 s4, s14, s17
+; GFX1250-NEXT: s_add_co_ci_u32 s4, s15, s16
; GFX1250-NEXT: s_mul_i32 s12, s9, s13
; GFX1250-NEXT: s_add_co_ci_u32 s13, s18, 0
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[6:7], s[12:13]
+; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[4:5], s[12:13]
; GFX1250-NEXT: s_add_co_u32 s8, s8, s12
-; GFX1250-NEXT: s_cselect_b32 s6, -1, 0
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1250-NEXT: s_cmp_lg_u32 s4, 0
; GFX1250-NEXT: s_add_co_ci_u32 s9, s9, s13
; GFX1250-NEXT: s_mul_u64 s[10:11], s[10:11], s[8:9]
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_mul_hi_u32 s13, s8, s11
; GFX1250-NEXT: s_mul_i32 s12, s8, s11
-; GFX1250-NEXT: s_mul_hi_u32 s6, s8, s10
+; GFX1250-NEXT: s_mul_hi_u32 s4, s8, s10
; GFX1250-NEXT: s_mul_i32 s15, s9, s10
-; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[6:7], s[12:13]
+; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[4:5], s[12:13]
; GFX1250-NEXT: s_mul_hi_u32 s14, s9, s10
; GFX1250-NEXT: s_mul_hi_u32 s16, s9, s11
-; GFX1250-NEXT: s_add_co_u32 s6, s12, s15
-; GFX1250-NEXT: s_add_co_ci_u32 s6, s13, s14
+; GFX1250-NEXT: s_add_co_u32 s4, s12, s15
+; GFX1250-NEXT: s_add_co_ci_u32 s4, s13, s14
; GFX1250-NEXT: s_mul_i32 s10, s9, s11
; GFX1250-NEXT: s_add_co_ci_u32 s11, s16, 0
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250-NEXT: s_add_nc_u64 s[10:11], s[6:7], s[10:11]
+; GFX1250-NEXT: s_add_nc_u64 s[10:11], s[4:5], s[10:11]
; GFX1250-NEXT: s_add_co_u32 s8, s8, s10
; GFX1250-NEXT: s_cselect_b32 s10, -1, 0
-; GFX1250-NEXT: s_mul_hi_u32 s6, s2, s8
+; GFX1250-NEXT: s_mul_hi_u32 s4, s2, s8
; GFX1250-NEXT: s_cmp_lg_u32 s10, 0
; GFX1250-NEXT: s_mul_hi_u32 s12, s3, s8
; GFX1250-NEXT: s_add_co_ci_u32 s10, s9, s11
@@ -3170,33 +3170,33 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: s_mul_hi_u32 s9, s2, s10
; GFX1250-NEXT: s_mul_i32 s8, s2, s10
; GFX1250-NEXT: s_mul_hi_u32 s13, s3, s10
-; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[6:7], s[8:9]
+; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[8:9]
; GFX1250-NEXT: s_mul_i32 s10, s3, s10
-; GFX1250-NEXT: s_add_co_u32 s6, s8, s11
-; GFX1250-NEXT: s_add_co_ci_u32 s6, s9, s12
+; GFX1250-NEXT: s_add_co_u32 s4, s8, s11
+; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s12
; GFX1250-NEXT: s_add_co_ci_u32 s11, s13, 0
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[6:7], s[10:11]
+; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[10:11]
; GFX1250-NEXT: s_and_b64 s[10:11], s[8:9], 0xffffffff00000000
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_or_b32 s10, s10, s8
-; GFX1250-NEXT: s_mul_u64 s[8:9], s[4:5], s[10:11]
+; GFX1250-NEXT: s_mul_u64 s[8:9], s[6:7], s[10:11]
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT: s_sub_co_u32 s6, s2, s8
+; GFX1250-NEXT: s_sub_co_u32 s4, s2, s8
; GFX1250-NEXT: s_cselect_b32 s8, -1, 0
; GFX1250-NEXT: s_sub_co_i32 s12, s3, s9
; GFX1250-NEXT: s_cmp_lg_u32 s8, 0
-; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, s5
-; GFX1250-NEXT: s_sub_co_u32 s13, s6, s4
+; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, s7
+; GFX1250-NEXT: s_sub_co_u32 s13, s4, s6
; GFX1250-NEXT: s_cselect_b32 s14, -1, 0
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_cmp_lg_u32 s14, 0
; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, 0
-; GFX1250-NEXT: s_cmp_ge_u32 s12, s5
+; GFX1250-NEXT: s_cmp_ge_u32 s12, s7
; GFX1250-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1250-NEXT: s_cmp_ge_u32 s13, s4
+; GFX1250-NEXT: s_cmp_ge_u32 s13, s6
; GFX1250-NEXT: s_cselect_b32 s15, -1, 0
-; GFX1250-NEXT: s_cmp_eq_u32 s12, s5
+; GFX1250-NEXT: s_cmp_eq_u32 s12, s7
; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[10:11], 1
; GFX1250-NEXT: s_cselect_b32 s16, s15, s14
; GFX1250-NEXT: s_add_nc_u64 s[14:15], s[10:11], 2
@@ -3206,20 +3206,20 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: s_cmp_lg_u32 s8, 0
; GFX1250-NEXT: s_sub_co_ci_u32 s3, s3, s9
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT: s_cmp_ge_u32 s3, s5
+; GFX1250-NEXT: s_cmp_ge_u32 s3, s7
; GFX1250-NEXT: s_cselect_b32 s8, -1, 0
-; GFX1250-NEXT: s_cmp_ge_u32 s6, s4
-; GFX1250-NEXT: s_cselect_b32 s6, -1, 0
-; GFX1250-NEXT: s_cmp_eq_u32 s3, s5
-; GFX1250-NEXT: s_cselect_b32 s3, s6, s8
+; GFX1250-NEXT: s_cmp_ge_u32 s4, s6
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: s_cmp_eq_u32 s3, s7
+; GFX1250-NEXT: s_cselect_b32 s3, s4, s8
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_cmp_lg_u32 s3, 0
; GFX1250-NEXT: s_cselect_b32 s9, s13, s11
; GFX1250-NEXT: s_cselect_b32 s8, s12, s10
; GFX1250-NEXT: s_cbranch_execnz .LBB16_3
; GFX1250-NEXT: .LBB16_2:
-; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, s4
-; GFX1250-NEXT: s_sub_co_i32 s5, 0, s4
+; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, s6
+; GFX1250-NEXT: s_sub_co_i32 s4, 0, s6
; GFX1250-NEXT: s_mov_b32 s9, 0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
; GFX1250-NEXT: v_rcp_iflag_f32_e32 v0, v0
@@ -3228,23 +3228,23 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX1250-NEXT: v_readfirstlane_b32 s3, v0
-; GFX1250-NEXT: s_mul_i32 s5, s5, s3
+; GFX1250-NEXT: s_mul_i32 s4, s4, s3
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250-NEXT: s_mul_hi_u32 s5, s3, s5
-; GFX1250-NEXT: s_add_co_i32 s3, s3, s5
+; GFX1250-NEXT: s_mul_hi_u32 s4, s3, s4
+; GFX1250-NEXT: s_add_co_i32 s3, s3, s4
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_mul_hi_u32 s3, s2, s3
-; GFX1250-NEXT: s_mul_i32 s5, s3, s4
+; GFX1250-NEXT: s_mul_i32 s4, s3, s6
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT: s_sub_co_i32 s2, s2, s5
-; GFX1250-NEXT: s_add_co_i32 s5, s3, 1
-; GFX1250-NEXT: s_sub_co_i32 s6, s2, s4
-; GFX1250-NEXT: s_cmp_ge_u32 s2, s4
-; GFX1250-NEXT: s_cselect_b32 s3, s5, s3
-; GFX1250-NEXT: s_cselect_b32 s2, s6, s2
-; GFX1250-NEXT: s_add_co_i32 s5, s3, 1
-; GFX1250-NEXT: s_cmp_ge_u32 s2, s4
-; GFX1250-NEXT: s_cselect_b32 s8, s5, s3
+; GFX1250-NEXT: s_sub_co_i32 s2, s2, s4
+; GFX1250-NEXT: s_add_co_i32 s4, s3, 1
+; GFX1250-NEXT: s_sub_co_i32 s5, s2, s6
+; GFX1250-NEXT: s_cmp_ge_u32 s2, s6
+; GFX1250-NEXT: s_cselect_b32 s3, s4, s3
+; GFX1250-NEXT: s_cselect_b32 s2, s5, s2
+; GFX1250-NEXT: s_add_co_i32 s4, s3, 1
+; GFX1250-NEXT: s_cmp_ge_u32 s2, s6
+; GFX1250-NEXT: s_cselect_b32 s8, s4, s3
; GFX1250-NEXT: .LBB16_3:
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
index 029aa39..c475efb 100644
--- a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
@@ -128,13 +128,13 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]]
; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY1]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY2]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY2]]
; CHECK-NEXT: SI_RETURN
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vreg_64 = COPY $vgpr2_vgpr3
undef %2.sub0_sub1:areg_128 = COPY %0
%2.sub2_sub3:areg_128 = COPY %1
- INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %2
SI_RETURN
...
@@ -153,13 +153,13 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]]
; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY1]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY2]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY2]]
; CHECK-NEXT: SI_RETURN
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vreg_64 = COPY $vgpr2_vgpr3
undef %2.sub0_sub1:areg_128_align2 = COPY %0
%2.sub2_sub3:areg_128_align2 = COPY %1
- INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %2
SI_RETURN
...
@@ -398,14 +398,14 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]]
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]]
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
undef %1.sub0:areg_128 = COPY %0
%1.sub1:areg_128 = COPY %0
%1.sub2:areg_128 = COPY %0
%1.sub3:areg_128 = COPY %0
- INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %1
+ INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %1
SI_RETURN
...
@@ -425,14 +425,14 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]]
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]]
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
undef %1.sub0:areg_128_align2 = COPY %0
%1.sub1:areg_128_align2 = COPY %0
%1.sub2:areg_128_align2 = COPY %0
%1.sub3:areg_128_align2 = COPY %0
- INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %1
+ INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %1
SI_RETURN
...
@@ -504,7 +504,7 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY [[COPY]].sub0
; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4325385 /* reguse:AReg_64 */, [[COPY1]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3670025 /* reguse:VS_64_with_sub1 */, [[COPY]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:VReg_64 */, [[COPY]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
undef %1.sub0:areg_64 = COPY %0
@@ -512,7 +512,7 @@ body: |
undef %2.sub0:vreg_64 = COPY %0
%2.sub1:vreg_64 = COPY %0
INLINEASM &"; use $0", 0 /* attdialect */, 4325385 /* reguse:AReg_64 */, killed %1
- INLINEASM &"; use $0", 0 /* attdialect */, 3670025 /* reguse:VReg_64 */, killed %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:VReg_64 */, killed %2
SI_RETURN
...
@@ -641,13 +641,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0_sub1:vreg_128 =COPY $vgpr0_vgpr1
%0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1
%2.sub2_sub3:areg_128 = COPY %0.sub2_sub3
- INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %2
SI_RETURN
...
@@ -668,13 +668,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_128 = COPY $vgpr2_vgpr3
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub1
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_128 =COPY $vgpr0_vgpr1
%0.sub1:vreg_128 = COPY $vgpr2_vgpr3
undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0
%2.sub2_sub3:areg_128_align2 = COPY %0.sub1
- INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %2
SI_RETURN
...
@@ -890,14 +890,14 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]].sub0
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_64 = COPY $vgpr0
undef %1.sub0:areg_128 = COPY %0.sub0
%1.sub1:areg_128 = COPY %0.sub0
%1.sub2:areg_128 = COPY %0.sub0
%1.sub3:areg_128 = COPY %0.sub0
- INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %1
+ INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %1
SI_RETURN
...
@@ -917,14 +917,14 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]].sub0
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_64 = COPY $vgpr0
undef %1.sub0:areg_128_align2 = COPY %0.sub0
%1.sub1:areg_128_align2 = COPY %0.sub0
%1.sub2:areg_128_align2 = COPY %0.sub0
%1.sub3:areg_128_align2 = COPY %0.sub0
- INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %1
+ INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %1
SI_RETURN
...
@@ -1051,13 +1051,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1
%0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1
%2.sub2_sub3:areg_128 = COPY %0.sub2_sub3
- INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %2
SI_RETURN
...
@@ -1076,13 +1076,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0_sub1
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub2_sub3
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0_sub1:vreg_128_align2 = COPY $vgpr0_vgpr1
%0.sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3
undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0_sub1
%2.sub2_sub3:areg_128_align2 = COPY %0.sub2_sub3
- INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %2
SI_RETURN
...
@@ -1358,11 +1358,11 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128 = COPY [[COPY]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
%0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
%2:areg_128 = COPY %0
- INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %2
SI_RETURN
...
@@ -1379,11 +1379,11 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[COPY]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
%0:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
%2:areg_128_align2 = COPY %0
- INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %2
SI_RETURN
...
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index be60a00..0cae0e5 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -705,12 +705,13 @@ define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(ptr addrspace(1) %C
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_load_b32 v1, v0, s[0:1] scale_offset
-; GFX1250-NEXT: global_load_b32 v0, v0, s[2:3] scale_offset
-; GFX1250-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1250-NEXT: global_load_b32 v2, v0, s[2:3] scale_offset
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s5
; GFX1250-NEXT: s_wait_loadcnt 0x1
-; GFX1250-NEXT: ds_store_b32 v2, v1 offset:32
+; GFX1250-NEXT: ds_store_b32 v0, v1 offset:32
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: ds_store_b32 v3, v0 offset:32
+; GFX1250-NEXT: ds_store_b32 v3, v2 offset:32
; GFX1250-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, ptr addrspace(1) %in0, i32 %x.i
@@ -1282,14 +1283,14 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3)
;
; GFX1250-LABEL: simple_write2_v4f32_superreg_align4:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s8, s[4:5], 0x0
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1250-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_lshl_add_u32 v0, v0, 4, s4
+; GFX1250-NEXT: v_lshl_add_u32 v0, v0, 4, s8
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3
; GFX1250-NEXT: v_dual_mov_b32 v3, s0 :: v_dual_mov_b32 v4, s1
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
index 1e7855c..eefc781 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
@@ -541,11 +541,10 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-LABEL: flat_xchg_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -570,9 +569,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB10_2
; GFX1250-SDAG-NEXT: .LBB10_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: s_clause 0x1
@@ -586,14 +584,13 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
;
; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_rtn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -618,10 +615,9 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB10_2
; GFX1250-GISEL-NEXT: .LBB10_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: s_clause 0x1
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
@@ -727,13 +723,12 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB11_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -754,9 +749,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB11_2
; GFX1250-SDAG-NEXT: .LBB11_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: s_clause 0x1
@@ -770,8 +764,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
;
; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_rtn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
@@ -780,7 +773,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -805,10 +798,9 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX1250-GISEL-NEXT: .LBB11_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: s_clause 0x1
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
@@ -917,11 +909,10 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -943,9 +934,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB12_2
; GFX1250-SDAG-NEXT: .LBB12_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_store_b64 v0, v[2:3], off
@@ -953,15 +943,14 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
;
; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_nortn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB12_3
@@ -982,10 +971,9 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB12_2
; GFX1250-GISEL-NEXT: .LBB12_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -1069,11 +1057,9 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB13_3
@@ -1094,9 +1080,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB13_2
; GFX1250-SDAG-NEXT: .LBB13_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_store_b64 v0, v[2:3], off
@@ -1104,8 +1089,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
;
; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_nortn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1115,7 +1099,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB13_3
@@ -1136,10 +1120,9 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB13_2
; GFX1250-GISEL-NEXT: .LBB13_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -1400,11 +1383,10 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-LABEL: flat_add_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -1429,9 +1411,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB18_2
; GFX1250-SDAG-NEXT: .LBB18_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -1445,14 +1426,13 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
;
; GFX1250-GISEL-LABEL: flat_add_saddr_i64_rtn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -1477,10 +1457,9 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB18_2
; GFX1250-GISEL-NEXT: .LBB18_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -1590,13 +1569,12 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB19_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -1617,9 +1595,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB19_2
; GFX1250-SDAG-NEXT: .LBB19_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -1633,8 +1610,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
;
; GFX1250-GISEL-LABEL: flat_add_saddr_i64_rtn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
@@ -1643,7 +1619,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -1668,10 +1644,9 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB19_2
; GFX1250-GISEL-NEXT: .LBB19_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -1784,11 +1759,10 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -1810,9 +1784,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB20_2
; GFX1250-SDAG-NEXT: .LBB20_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -1823,15 +1796,14 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
;
; GFX1250-GISEL-LABEL: flat_add_saddr_i64_nortn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB20_3
@@ -1852,10 +1824,9 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB20_2
; GFX1250-GISEL-NEXT: .LBB20_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -1950,11 +1921,9 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB21_3
@@ -1975,9 +1944,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB21_2
; GFX1250-SDAG-NEXT: .LBB21_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -1988,8 +1956,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
;
; GFX1250-GISEL-LABEL: flat_add_saddr_i64_nortn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1999,7 +1966,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB21_3
@@ -2020,10 +1987,9 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB21_2
; GFX1250-GISEL-NEXT: .LBB21_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -2295,11 +2261,10 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-LABEL: flat_sub_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -2324,9 +2289,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB26_2
; GFX1250-SDAG-NEXT: .LBB26_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -2340,14 +2304,13 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
;
; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_rtn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -2372,10 +2335,9 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB26_2
; GFX1250-GISEL-NEXT: .LBB26_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -2487,13 +2449,12 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB27_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -2514,9 +2475,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB27_2
; GFX1250-SDAG-NEXT: .LBB27_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -2530,8 +2490,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
;
; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_rtn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
@@ -2540,7 +2499,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -2565,10 +2524,9 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB27_2
; GFX1250-GISEL-NEXT: .LBB27_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -2683,11 +2641,10 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -2709,9 +2666,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB28_2
; GFX1250-SDAG-NEXT: .LBB28_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -2722,15 +2678,14 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
;
; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_nortn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB28_3
@@ -2751,10 +2706,9 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB28_2
; GFX1250-GISEL-NEXT: .LBB28_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -2851,11 +2805,9 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB29_3
@@ -2876,9 +2828,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB29_2
; GFX1250-SDAG-NEXT: .LBB29_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -2889,8 +2840,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
;
; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_nortn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -2900,7 +2850,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB29_3
@@ -2921,10 +2871,9 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB29_2
; GFX1250-GISEL-NEXT: .LBB29_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -3198,11 +3147,10 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-LABEL: flat_and_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -3227,9 +3175,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB34_2
; GFX1250-SDAG-NEXT: .LBB34_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -3244,14 +3191,13 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
;
; GFX1250-GISEL-LABEL: flat_and_saddr_i64_rtn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -3276,10 +3222,9 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB34_2
; GFX1250-GISEL-NEXT: .LBB34_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -3390,13 +3335,12 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB35_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -3417,9 +3361,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB35_2
; GFX1250-SDAG-NEXT: .LBB35_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -3434,8 +3377,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
;
; GFX1250-GISEL-LABEL: flat_and_saddr_i64_rtn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
@@ -3444,7 +3386,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -3469,10 +3411,9 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB35_2
; GFX1250-GISEL-NEXT: .LBB35_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -3586,11 +3527,10 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -3612,9 +3552,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB36_2
; GFX1250-SDAG-NEXT: .LBB36_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -3626,15 +3565,14 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
;
; GFX1250-GISEL-LABEL: flat_and_saddr_i64_nortn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB36_3
@@ -3655,10 +3593,9 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB36_2
; GFX1250-GISEL-NEXT: .LBB36_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -3754,11 +3691,9 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB37_3
@@ -3779,9 +3714,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB37_2
; GFX1250-SDAG-NEXT: .LBB37_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -3793,8 +3727,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
;
; GFX1250-GISEL-LABEL: flat_and_saddr_i64_nortn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -3804,7 +3737,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB37_3
@@ -3825,10 +3758,9 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB37_2
; GFX1250-GISEL-NEXT: .LBB37_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -4101,11 +4033,10 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
; GFX1250-SDAG-LABEL: flat_or_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -4130,9 +4061,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB42_2
; GFX1250-SDAG-NEXT: .LBB42_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -4147,14 +4077,13 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
;
; GFX1250-GISEL-LABEL: flat_or_saddr_i64_rtn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -4179,10 +4108,9 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB42_2
; GFX1250-GISEL-NEXT: .LBB42_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -4293,13 +4221,12 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB43_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -4320,9 +4247,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB43_2
; GFX1250-SDAG-NEXT: .LBB43_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -4337,8 +4263,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
;
; GFX1250-GISEL-LABEL: flat_or_saddr_i64_rtn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
@@ -4347,7 +4272,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -4372,10 +4297,9 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB43_2
; GFX1250-GISEL-NEXT: .LBB43_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -4489,11 +4413,10 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -4515,9 +4438,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB44_2
; GFX1250-SDAG-NEXT: .LBB44_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -4529,15 +4451,14 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
;
; GFX1250-GISEL-LABEL: flat_or_saddr_i64_nortn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB44_3
@@ -4558,10 +4479,9 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB44_2
; GFX1250-GISEL-NEXT: .LBB44_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -4657,11 +4577,9 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB45_3
@@ -4682,9 +4600,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB45_2
; GFX1250-SDAG-NEXT: .LBB45_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -4696,8 +4613,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
;
; GFX1250-GISEL-LABEL: flat_or_saddr_i64_nortn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -4707,7 +4623,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB45_3
@@ -4728,10 +4644,9 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB45_2
; GFX1250-GISEL-NEXT: .LBB45_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -5004,11 +4919,10 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-LABEL: flat_xor_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -5033,9 +4947,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB50_2
; GFX1250-SDAG-NEXT: .LBB50_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -5050,14 +4963,13 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
;
; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_rtn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -5082,10 +4994,9 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB50_2
; GFX1250-GISEL-NEXT: .LBB50_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -5196,13 +5107,12 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB51_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -5223,9 +5133,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB51_2
; GFX1250-SDAG-NEXT: .LBB51_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -5240,8 +5149,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
;
; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_rtn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
@@ -5250,7 +5158,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -5275,10 +5183,9 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB51_2
; GFX1250-GISEL-NEXT: .LBB51_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -5392,11 +5299,10 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -5418,9 +5324,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB52_2
; GFX1250-SDAG-NEXT: .LBB52_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -5432,15 +5337,14 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
;
; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_nortn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB52_3
@@ -5461,10 +5365,9 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB52_2
; GFX1250-GISEL-NEXT: .LBB52_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -5560,11 +5463,9 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB53_3
@@ -5585,9 +5486,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB53_2
; GFX1250-SDAG-NEXT: .LBB53_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -5599,8 +5499,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
;
; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_nortn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5610,7 +5509,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB53_3
@@ -5631,10 +5530,9 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB53_2
; GFX1250-GISEL-NEXT: .LBB53_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -5877,11 +5775,10 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-LABEL: flat_max_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -5902,9 +5799,8 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB58_2
; GFX1250-SDAG-NEXT: .LBB58_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -5918,14 +5814,13 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
;
; GFX1250-GISEL-LABEL: flat_max_saddr_i64_rtn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -5946,10 +5841,9 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB58_2
; GFX1250-GISEL-NEXT: .LBB58_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -6061,13 +5955,12 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB59_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -6084,9 +5977,8 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB59_2
; GFX1250-SDAG-NEXT: .LBB59_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -6100,8 +5992,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
;
; GFX1250-GISEL-LABEL: flat_max_saddr_i64_rtn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
@@ -6110,7 +6001,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -6131,10 +6022,9 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB59_2
; GFX1250-GISEL-NEXT: .LBB59_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -6249,11 +6139,10 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -6272,9 +6161,8 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB60_2
; GFX1250-SDAG-NEXT: .LBB60_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -6285,15 +6173,14 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
;
; GFX1250-GISEL-LABEL: flat_max_saddr_i64_nortn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB60_3
@@ -6311,10 +6198,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB60_2
; GFX1250-GISEL-NEXT: .LBB60_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -6409,11 +6295,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB61_3
@@ -6431,9 +6315,8 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB61_2
; GFX1250-SDAG-NEXT: .LBB61_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -6444,8 +6327,7 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
;
; GFX1250-GISEL-LABEL: flat_max_saddr_i64_nortn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -6455,7 +6337,7 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB61_3
@@ -6473,10 +6355,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB61_2
; GFX1250-GISEL-NEXT: .LBB61_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -6718,11 +6599,10 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-LABEL: flat_min_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -6743,9 +6623,8 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB66_2
; GFX1250-SDAG-NEXT: .LBB66_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -6759,14 +6638,13 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
;
; GFX1250-GISEL-LABEL: flat_min_saddr_i64_rtn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -6787,10 +6665,9 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB66_2
; GFX1250-GISEL-NEXT: .LBB66_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -6902,13 +6779,12 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB67_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -6925,9 +6801,8 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB67_2
; GFX1250-SDAG-NEXT: .LBB67_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -6941,8 +6816,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
;
; GFX1250-GISEL-LABEL: flat_min_saddr_i64_rtn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
@@ -6951,7 +6825,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -6972,10 +6846,9 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB67_2
; GFX1250-GISEL-NEXT: .LBB67_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -7090,11 +6963,10 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -7113,9 +6985,8 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB68_2
; GFX1250-SDAG-NEXT: .LBB68_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -7126,15 +6997,14 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
;
; GFX1250-GISEL-LABEL: flat_min_saddr_i64_nortn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB68_3
@@ -7152,10 +7022,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB68_2
; GFX1250-GISEL-NEXT: .LBB68_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -7250,11 +7119,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB69_3
@@ -7272,9 +7139,8 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB69_2
; GFX1250-SDAG-NEXT: .LBB69_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -7285,8 +7151,7 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
;
; GFX1250-GISEL-LABEL: flat_min_saddr_i64_nortn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -7296,7 +7161,7 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB69_3
@@ -7314,10 +7179,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB69_2
; GFX1250-GISEL-NEXT: .LBB69_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -7559,11 +7423,10 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-LABEL: flat_umax_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -7584,9 +7447,8 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB74_2
; GFX1250-SDAG-NEXT: .LBB74_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -7600,14 +7462,13 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
;
; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_rtn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -7628,10 +7489,9 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB74_2
; GFX1250-GISEL-NEXT: .LBB74_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -7743,13 +7603,12 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB75_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -7766,9 +7625,8 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB75_2
; GFX1250-SDAG-NEXT: .LBB75_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -7782,8 +7640,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
;
; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_rtn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
@@ -7792,7 +7649,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -7813,10 +7670,9 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB75_2
; GFX1250-GISEL-NEXT: .LBB75_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -7931,11 +7787,10 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -7954,9 +7809,8 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB76_2
; GFX1250-SDAG-NEXT: .LBB76_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -7967,15 +7821,14 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
;
; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_nortn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB76_3
@@ -7993,10 +7846,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB76_2
; GFX1250-GISEL-NEXT: .LBB76_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -8091,11 +7943,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB77_3
@@ -8113,9 +7963,8 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB77_2
; GFX1250-SDAG-NEXT: .LBB77_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -8126,8 +7975,7 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
;
; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_nortn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -8137,7 +7985,7 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB77_3
@@ -8155,10 +8003,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB77_2
; GFX1250-GISEL-NEXT: .LBB77_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -8400,11 +8247,10 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-LABEL: flat_umin_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -8425,9 +8271,8 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB82_2
; GFX1250-SDAG-NEXT: .LBB82_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -8441,14 +8286,13 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
;
; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_rtn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -8469,10 +8313,9 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB82_2
; GFX1250-GISEL-NEXT: .LBB82_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -8584,13 +8427,12 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB83_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -8607,9 +8449,8 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB83_2
; GFX1250-SDAG-NEXT: .LBB83_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -8623,8 +8464,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
;
; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_rtn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
@@ -8633,7 +8473,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -8654,10 +8494,9 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB83_2
; GFX1250-GISEL-NEXT: .LBB83_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -8772,11 +8611,10 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -8795,9 +8633,8 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB84_2
; GFX1250-SDAG-NEXT: .LBB84_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -8808,15 +8645,14 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
;
; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_nortn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB84_3
@@ -8834,10 +8670,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB84_2
; GFX1250-GISEL-NEXT: .LBB84_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -8932,11 +8767,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB85_3
@@ -8954,9 +8787,8 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB85_2
; GFX1250-SDAG-NEXT: .LBB85_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -8967,8 +8799,7 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
;
; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_nortn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -8978,7 +8809,7 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB85_3
@@ -8996,10 +8827,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB85_2
; GFX1250-GISEL-NEXT: .LBB85_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -9281,12 +9111,11 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %
; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v5, v4
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v4, v3
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v3
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v3
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -9311,9 +9140,8 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB90_2
; GFX1250-SDAG-NEXT: .LBB90_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v2
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v8, off
@@ -9328,15 +9156,14 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %
;
; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_rtn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v10, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v0, v5
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v3, v10
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v3 bitop3:0x14
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -9361,10 +9188,9 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB90_2
; GFX1250-GISEL-NEXT: .LBB90_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -9485,13 +9311,12 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v3
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v3
; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB91_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -9512,9 +9337,8 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB91_2
; GFX1250-SDAG-NEXT: .LBB91_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v2
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v8, off
@@ -9529,10 +9353,9 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
;
; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_rtn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v10, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -9540,7 +9363,7 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v3, v10
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v3 bitop3:0x14
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -9565,10 +9388,9 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB91_2
; GFX1250-GISEL-NEXT: .LBB91_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -9690,13 +9512,12 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs
; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_nortn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1
-; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v5, v4
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v4, v3
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, s0, v1
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, src_flat_scratch_base_hi, v1
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v2
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -9718,9 +9539,8 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB92_2
; GFX1250-SDAG-NEXT: .LBB92_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v2, s0, v0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v2, src_flat_scratch_base_lo, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v2, off
@@ -9732,15 +9552,15 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs
;
; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_nortn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, src_flat_scratch_base_hi :: v_dual_mov_b32 v8, v1
; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v1, v3, v5 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB92_3
@@ -9761,10 +9581,9 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB92_2
; GFX1250-GISEL-NEXT: .LBB92_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -9869,11 +9688,9 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, src_flat_scratch_base_hi, v1
; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v2
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB93_3
@@ -9894,9 +9711,8 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB93_2
; GFX1250-SDAG-NEXT: .LBB93_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v2, s0, v0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v2, src_flat_scratch_base_lo, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v2, off
@@ -9908,18 +9724,18 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32
;
; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_nortn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, src_flat_scratch_base_hi :: v_dual_mov_b32 v8, v1
; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v1, v3, v5 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB93_3
@@ -9940,10 +9756,9 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB93_2
; GFX1250-GISEL-NEXT: .LBB93_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -10188,11 +10003,10 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-LABEL: flat_inc_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -10214,10 +10028,9 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB98_2
; GFX1250-SDAG-NEXT: .LBB98_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
@@ -10233,14 +10046,13 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
;
; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_rtn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -10262,18 +10074,17 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB98_2
; GFX1250-GISEL-NEXT: .LBB98_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
@@ -10386,13 +10197,12 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB99_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -10410,10 +10220,9 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB99_2
; GFX1250-SDAG-NEXT: .LBB99_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
@@ -10429,8 +10238,7 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
;
; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_rtn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
@@ -10439,7 +10247,7 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -10461,18 +10269,17 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB99_2
; GFX1250-GISEL-NEXT: .LBB99_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
@@ -10588,11 +10395,10 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -10610,9 +10416,8 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB100_2
; GFX1250-SDAG-NEXT: .LBB100_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
@@ -10625,15 +10430,14 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
;
; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_nortn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB100_3
@@ -10650,17 +10454,16 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB100_2
; GFX1250-GISEL-NEXT: .LBB100_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -10754,11 +10557,9 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB101_3
@@ -10775,9 +10576,8 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB101_2
; GFX1250-SDAG-NEXT: .LBB101_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
@@ -10790,8 +10590,7 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
;
; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_nortn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -10801,7 +10600,7 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB101_3
@@ -10818,17 +10617,16 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB101_2
; GFX1250-GISEL-NEXT: .LBB101_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -11064,11 +10862,10 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-LABEL: flat_dec_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -11090,10 +10887,9 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB106_2
; GFX1250-SDAG-NEXT: .LBB106_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s0, v4
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
@@ -11112,14 +10908,13 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
;
; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_rtn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -11141,11 +10936,10 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB106_2
; GFX1250-GISEL-NEXT: .LBB106_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -11268,13 +11062,12 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB107_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -11292,10 +11085,9 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB107_2
; GFX1250-SDAG-NEXT: .LBB107_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s0, v4
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
@@ -11314,8 +11106,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
;
; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_rtn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
@@ -11324,7 +11115,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -11346,11 +11137,10 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB107_2
; GFX1250-GISEL-NEXT: .LBB107_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -11476,11 +11266,10 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -11498,9 +11287,8 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB108_2
; GFX1250-SDAG-NEXT: .LBB108_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -11516,15 +11304,14 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
;
; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_nortn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB108_3
@@ -11541,10 +11328,9 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB108_2
; GFX1250-GISEL-NEXT: .LBB108_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -11648,11 +11434,9 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB109_3
@@ -11669,9 +11453,8 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB109_2
; GFX1250-SDAG-NEXT: .LBB109_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -11687,8 +11470,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
;
; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_nortn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -11698,7 +11480,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB109_3
@@ -11715,10 +11497,9 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB109_2
; GFX1250-GISEL-NEXT: .LBB109_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -11834,12 +11615,10 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) {
; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB110_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %atomicrmw.check.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2
+; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000
; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB110_4
; GFX1250-SDAG-NEXT: ; %bb.2: ; %atomicrmw.global
@@ -11855,9 +11634,7 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) {
; GFX1250-SDAG-NEXT: .LBB110_4:
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: .LBB110_5: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2
+; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1250-SDAG-NEXT: s_cselect_b32 s2, s2, -1
; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s2
@@ -11891,10 +11668,9 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) {
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB110_6
; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.check.private
-; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250-GISEL-NEXT: s_xor_b32 s2, s1, s2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s2, 0x4000000
; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1
; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB110_3
@@ -11910,9 +11686,7 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) {
; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB110_5
; GFX1250-GISEL-NEXT: ; %bb.4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-GISEL-NEXT: s_sub_co_i32 s2, s0, s2
+; GFX1250-GISEL-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1250-GISEL-NEXT: s_cselect_b32 s2, s2, -1
; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s2
@@ -12060,12 +11834,10 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) {
; GFX1250-SDAG-NEXT: .LBB111_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
; GFX1250-SDAG-NEXT: .LBB111_3: ; %atomicrmw.check.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2
+; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000
; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
; GFX1250-SDAG-NEXT: s_mov_b32 s2, -1
; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB111_5
@@ -12079,9 +11851,7 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) {
; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
; GFX1250-SDAG-NEXT: s_cbranch_vccnz .LBB111_7
; GFX1250-SDAG-NEXT: ; %bb.6: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2
+; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1250-SDAG-NEXT: s_cselect_b32 s2, s2, -1
; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s2
@@ -12112,9 +11882,8 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) {
; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s1, s3
; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB111_6
; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.check.private
-; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250-GISEL-NEXT: s_xor_b32 s2, s1, s2
+; GFX1250-GISEL-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s2, 0x4000000
; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1
; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB111_3
@@ -12130,9 +11899,7 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) {
; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB111_5
; GFX1250-GISEL-NEXT: ; %bb.4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-GISEL-NEXT: s_sub_co_i32 s2, s0, s2
+; GFX1250-GISEL-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1250-GISEL-NEXT: s_cselect_b32 s2, s2, -1
; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s2
@@ -12261,9 +12028,8 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) {
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50
-; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2
+; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000
; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -12279,10 +12045,9 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) {
; GFX1250-SDAG-NEXT: .LBB112_2:
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: .LBB112_3: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo
-; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2
+; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1
; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -12301,10 +12066,9 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) {
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50
; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0
-; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4
+; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000
; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1
; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB112_2
@@ -12320,10 +12084,9 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) {
; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0
; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB112_4
; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
-; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
-; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0
+; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1
; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -12413,9 +12176,8 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) {
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50
-; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2
+; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000
; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -12434,11 +12196,10 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) {
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB113_2
; GFX1250-SDAG-NEXT: .LBB113_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo
+; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo
+; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2
-; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1
; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -12454,9 +12215,8 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) {
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50
; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0
-; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4
+; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000
; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1
; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB113_2
@@ -12473,10 +12233,9 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) {
; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0
; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB113_4
; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
-; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
-; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0
+; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1
; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -12562,9 +12321,8 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) {
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50
-; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2
+; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000
; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -12580,10 +12338,9 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) {
; GFX1250-SDAG-NEXT: .LBB114_2:
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: .LBB114_3: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo
-; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2
+; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1
; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -12602,10 +12359,9 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) {
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50
; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0
-; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4
+; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000
; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1
; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB114_2
@@ -12621,10 +12377,9 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) {
; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0
; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB114_4
; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
-; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
-; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0
+; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1
; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -12714,9 +12469,8 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) {
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50
-; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2
+; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000
; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -12735,11 +12489,10 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) {
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB115_2
; GFX1250-SDAG-NEXT: .LBB115_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo
+; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo
+; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2
-; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1
; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -12755,9 +12508,8 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) {
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50
; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0
-; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4
+; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000
; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1
; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB115_2
@@ -12774,10 +12526,9 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) {
; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0
; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB115_4
; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
-; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
-; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0
+; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1
; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
index 2079543..b5b2655 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
@@ -2179,6 +2179,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400
+; GFX1250-SDAG-NEXT: ; kill: killed $sgpr4_sgpr5
; GFX1250-SDAG-NEXT: s_cbranch_scc0 .LBB117_1
; GFX1250-SDAG-NEXT: ; %bb.2: ; %bb2
; GFX1250-SDAG-NEXT: s_endpgm
@@ -2190,15 +2191,16 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX1250-GISEL-NEXT: .LBB117_1: ; %bb3
; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 4, v[2:3]
+; GFX1250-GISEL-NEXT: ; kill: killed $vgpr4 killed $vgpr5
+; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX1250-GISEL-NEXT: flat_load_b32 v6, v[4:5] scope:SCOPE_SYS
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: flat_load_b32 v6, v[4:5] scope:SCOPE_SYS
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: flat_load_b32 v4, v[4:5] scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x400, v2
; GFX1250-GISEL-NEXT: s_cbranch_vccz .LBB117_1
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll
index 4827f75..5e6de6d 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll
@@ -162,32 +162,32 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX1250-LABEL: test_fmax3_olt_0_f32:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX1250-NEXT: s_mov_b32 s10, -1
-; GFX1250-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1250-NEXT: s_mov_b32 s14, s10
-; GFX1250-NEXT: s_mov_b32 s15, s11
-; GFX1250-NEXT: s_mov_b32 s18, s10
-; GFX1250-NEXT: s_mov_b32 s19, s11
-; GFX1250-NEXT: s_mov_b32 s22, s10
-; GFX1250-NEXT: s_mov_b32 s23, s11
+; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s6, s2
+; GFX1250-NEXT: s_mov_b32 s7, s3
+; GFX1250-NEXT: s_mov_b32 s18, s2
+; GFX1250-NEXT: s_mov_b32 s19, s3
+; GFX1250-NEXT: s_mov_b32 s22, s2
+; GFX1250-NEXT: s_mov_b32 s23, s3
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s12, s2
-; GFX1250-NEXT: s_mov_b32 s13, s3
-; GFX1250-NEXT: s_mov_b32 s16, s4
-; GFX1250-NEXT: s_mov_b32 s17, s5
-; GFX1250-NEXT: s_mov_b32 s20, s6
-; GFX1250-NEXT: s_mov_b32 s21, s7
-; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-NEXT: s_mov_b32 s4, s10
+; GFX1250-NEXT: s_mov_b32 s5, s11
+; GFX1250-NEXT: s_mov_b32 s16, s12
+; GFX1250-NEXT: s_mov_b32 s17, s13
+; GFX1250-NEXT: s_mov_b32 s20, s14
+; GFX1250-NEXT: s_mov_b32 s21, s15
+; GFX1250-NEXT: buffer_load_b32 v0, off, s[4:7], null scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s8, s0
-; GFX1250-NEXT: s_mov_b32 s9, s1
+; GFX1250-NEXT: s_mov_b32 s0, s8
+; GFX1250-NEXT: s_mov_b32 s1, s9
; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v2
-; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX1250-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
@@ -352,32 +352,32 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX1250-LABEL: test_fmax3_olt_1_f32:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX1250-NEXT: s_mov_b32 s10, -1
-; GFX1250-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1250-NEXT: s_mov_b32 s14, s10
-; GFX1250-NEXT: s_mov_b32 s15, s11
-; GFX1250-NEXT: s_mov_b32 s18, s10
-; GFX1250-NEXT: s_mov_b32 s19, s11
-; GFX1250-NEXT: s_mov_b32 s22, s10
-; GFX1250-NEXT: s_mov_b32 s23, s11
+; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s6, s2
+; GFX1250-NEXT: s_mov_b32 s7, s3
+; GFX1250-NEXT: s_mov_b32 s18, s2
+; GFX1250-NEXT: s_mov_b32 s19, s3
+; GFX1250-NEXT: s_mov_b32 s22, s2
+; GFX1250-NEXT: s_mov_b32 s23, s3
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s12, s2
-; GFX1250-NEXT: s_mov_b32 s13, s3
-; GFX1250-NEXT: s_mov_b32 s16, s4
-; GFX1250-NEXT: s_mov_b32 s17, s5
-; GFX1250-NEXT: s_mov_b32 s20, s6
-; GFX1250-NEXT: s_mov_b32 s21, s7
-; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-NEXT: s_mov_b32 s4, s10
+; GFX1250-NEXT: s_mov_b32 s5, s11
+; GFX1250-NEXT: s_mov_b32 s16, s12
+; GFX1250-NEXT: s_mov_b32 s17, s13
+; GFX1250-NEXT: s_mov_b32 s20, s14
+; GFX1250-NEXT: s_mov_b32 s21, s15
+; GFX1250-NEXT: buffer_load_b32 v0, off, s[4:7], null scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s8, s0
-; GFX1250-NEXT: s_mov_b32 s9, s1
+; GFX1250-NEXT: s_mov_b32 s0, s8
+; GFX1250-NEXT: s_mov_b32 s1, s9
; GFX1250-NEXT: v_max3_num_f32 v0, v2, v0, v1
-; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX1250-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
@@ -609,62 +609,62 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
;
; GFX1250-TRUE16-LABEL: test_fmax3_olt_0_f16:
; GFX1250-TRUE16: ; %bb.0:
-; GFX1250-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX1250-TRUE16-NEXT: s_mov_b32 s10, -1
-; GFX1250-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1250-TRUE16-NEXT: s_mov_b32 s14, s10
-; GFX1250-TRUE16-NEXT: s_mov_b32 s15, s11
-; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s10
-; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s11
-; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s10
-; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s11
+; GFX1250-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-TRUE16-NEXT: s_mov_b32 s2, -1
+; GFX1250-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-TRUE16-NEXT: s_mov_b32 s6, s2
+; GFX1250-TRUE16-NEXT: s_mov_b32 s7, s3
+; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s2
+; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s3
+; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s2
+; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s3
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-TRUE16-NEXT: s_mov_b32 s12, s2
-; GFX1250-TRUE16-NEXT: s_mov_b32 s13, s3
-; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s4
-; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s5
-; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s6
-; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s7
-; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_mov_b32 s4, s10
+; GFX1250-TRUE16-NEXT: s_mov_b32 s5, s11
+; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s12
+; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s13
+; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s14
+; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s15
+; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[4:7], null scope:SCOPE_SYS
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX1250-TRUE16-NEXT: s_mov_b32 s8, s0
-; GFX1250-TRUE16-NEXT: s_mov_b32 s9, s1
+; GFX1250-TRUE16-NEXT: s_mov_b32 s0, s8
+; GFX1250-TRUE16-NEXT: s_mov_b32 s1, s9
; GFX1250-TRUE16-NEXT: v_max3_num_f16 v0.l, v0.l, v1.l, v2.l
-; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], null
+; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
; GFX1250-TRUE16-NEXT: s_endpgm
;
; GFX1250-FAKE16-LABEL: test_fmax3_olt_0_f16:
; GFX1250-FAKE16: ; %bb.0:
-; GFX1250-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX1250-FAKE16-NEXT: s_mov_b32 s10, -1
-; GFX1250-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1250-FAKE16-NEXT: s_mov_b32 s14, s10
-; GFX1250-FAKE16-NEXT: s_mov_b32 s15, s11
-; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s10
-; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s11
-; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s10
-; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s11
+; GFX1250-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX1250-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-FAKE16-NEXT: s_mov_b32 s6, s2
+; GFX1250-FAKE16-NEXT: s_mov_b32 s7, s3
+; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s2
+; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s3
+; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s2
+; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s3
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-FAKE16-NEXT: s_mov_b32 s12, s2
-; GFX1250-FAKE16-NEXT: s_mov_b32 s13, s3
-; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s4
-; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s5
-; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s6
-; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s7
-; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_mov_b32 s4, s10
+; GFX1250-FAKE16-NEXT: s_mov_b32 s5, s11
+; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s12
+; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s13
+; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s14
+; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s15
+; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[4:7], null scope:SCOPE_SYS
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX1250-FAKE16-NEXT: s_mov_b32 s8, s0
-; GFX1250-FAKE16-NEXT: s_mov_b32 s9, s1
+; GFX1250-FAKE16-NEXT: s_mov_b32 s0, s8
+; GFX1250-FAKE16-NEXT: s_mov_b32 s1, s9
; GFX1250-FAKE16-NEXT: v_max3_num_f16 v0, v0, v1, v2
-; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null
+; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
; GFX1250-FAKE16-NEXT: s_endpgm
%a = load volatile half, ptr addrspace(1) %aptr, align 2
%b = load volatile half, ptr addrspace(1) %bptr, align 2
@@ -897,62 +897,62 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
;
; GFX1250-TRUE16-LABEL: test_fmax3_olt_1_f16:
; GFX1250-TRUE16: ; %bb.0:
-; GFX1250-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX1250-TRUE16-NEXT: s_mov_b32 s10, -1
-; GFX1250-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1250-TRUE16-NEXT: s_mov_b32 s14, s10
-; GFX1250-TRUE16-NEXT: s_mov_b32 s15, s11
-; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s10
-; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s11
-; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s10
-; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s11
+; GFX1250-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-TRUE16-NEXT: s_mov_b32 s2, -1
+; GFX1250-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-TRUE16-NEXT: s_mov_b32 s6, s2
+; GFX1250-TRUE16-NEXT: s_mov_b32 s7, s3
+; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s2
+; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s3
+; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s2
+; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s3
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-TRUE16-NEXT: s_mov_b32 s12, s2
-; GFX1250-TRUE16-NEXT: s_mov_b32 s13, s3
-; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s4
-; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s5
-; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s6
-; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s7
-; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_mov_b32 s4, s10
+; GFX1250-TRUE16-NEXT: s_mov_b32 s5, s11
+; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s12
+; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s13
+; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s14
+; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s15
+; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: buffer_load_u16 v2, off, s[16:19], null scope:SCOPE_SYS
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[20:23], null scope:SCOPE_SYS
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX1250-TRUE16-NEXT: s_mov_b32 s8, s0
-; GFX1250-TRUE16-NEXT: s_mov_b32 s9, s1
+; GFX1250-TRUE16-NEXT: s_mov_b32 s0, s8
+; GFX1250-TRUE16-NEXT: s_mov_b32 s1, s9
; GFX1250-TRUE16-NEXT: v_max3_num_f16 v0.l, v0.l, v1.l, v2.l
-; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], null
+; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
; GFX1250-TRUE16-NEXT: s_endpgm
;
; GFX1250-FAKE16-LABEL: test_fmax3_olt_1_f16:
; GFX1250-FAKE16: ; %bb.0:
-; GFX1250-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX1250-FAKE16-NEXT: s_mov_b32 s10, -1
-; GFX1250-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1250-FAKE16-NEXT: s_mov_b32 s14, s10
-; GFX1250-FAKE16-NEXT: s_mov_b32 s15, s11
-; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s10
-; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s11
-; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s10
-; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s11
+; GFX1250-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX1250-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-FAKE16-NEXT: s_mov_b32 s6, s2
+; GFX1250-FAKE16-NEXT: s_mov_b32 s7, s3
+; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s2
+; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s3
+; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s2
+; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s3
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-FAKE16-NEXT: s_mov_b32 s12, s2
-; GFX1250-FAKE16-NEXT: s_mov_b32 s13, s3
-; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s4
-; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s5
-; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s6
-; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s7
-; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_mov_b32 s4, s10
+; GFX1250-FAKE16-NEXT: s_mov_b32 s5, s11
+; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s12
+; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s13
+; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s14
+; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s15
+; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[4:7], null scope:SCOPE_SYS
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX1250-FAKE16-NEXT: s_mov_b32 s8, s0
-; GFX1250-FAKE16-NEXT: s_mov_b32 s9, s1
+; GFX1250-FAKE16-NEXT: s_mov_b32 s0, s8
+; GFX1250-FAKE16-NEXT: s_mov_b32 s1, s9
; GFX1250-FAKE16-NEXT: v_max3_num_f16 v0, v2, v0, v1
-; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null
+; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
; GFX1250-FAKE16-NEXT: s_endpgm
%a = load volatile half, ptr addrspace(1) %aptr, align 2
%b = load volatile half, ptr addrspace(1) %bptr, align 2
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll
index 6dfefd8..6a6f232 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll
@@ -162,32 +162,32 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX1250-LABEL: test_fmin3_olt_0_f32:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX1250-NEXT: s_mov_b32 s10, -1
-; GFX1250-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1250-NEXT: s_mov_b32 s14, s10
-; GFX1250-NEXT: s_mov_b32 s15, s11
-; GFX1250-NEXT: s_mov_b32 s18, s10
-; GFX1250-NEXT: s_mov_b32 s19, s11
-; GFX1250-NEXT: s_mov_b32 s22, s10
-; GFX1250-NEXT: s_mov_b32 s23, s11
+; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s6, s2
+; GFX1250-NEXT: s_mov_b32 s7, s3
+; GFX1250-NEXT: s_mov_b32 s18, s2
+; GFX1250-NEXT: s_mov_b32 s19, s3
+; GFX1250-NEXT: s_mov_b32 s22, s2
+; GFX1250-NEXT: s_mov_b32 s23, s3
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s12, s2
-; GFX1250-NEXT: s_mov_b32 s13, s3
-; GFX1250-NEXT: s_mov_b32 s16, s4
-; GFX1250-NEXT: s_mov_b32 s17, s5
-; GFX1250-NEXT: s_mov_b32 s20, s6
-; GFX1250-NEXT: s_mov_b32 s21, s7
-; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-NEXT: s_mov_b32 s4, s10
+; GFX1250-NEXT: s_mov_b32 s5, s11
+; GFX1250-NEXT: s_mov_b32 s16, s12
+; GFX1250-NEXT: s_mov_b32 s17, s13
+; GFX1250-NEXT: s_mov_b32 s20, s14
+; GFX1250-NEXT: s_mov_b32 s21, s15
+; GFX1250-NEXT: buffer_load_b32 v0, off, s[4:7], null scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s8, s0
-; GFX1250-NEXT: s_mov_b32 s9, s1
+; GFX1250-NEXT: s_mov_b32 s0, s8
+; GFX1250-NEXT: s_mov_b32 s1, s9
; GFX1250-NEXT: v_min3_num_f32 v0, v0, v1, v2
-; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX1250-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
@@ -352,32 +352,32 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX1250-LABEL: test_fmin3_olt_1_f32:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX1250-NEXT: s_mov_b32 s10, -1
-; GFX1250-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1250-NEXT: s_mov_b32 s14, s10
-; GFX1250-NEXT: s_mov_b32 s15, s11
-; GFX1250-NEXT: s_mov_b32 s18, s10
-; GFX1250-NEXT: s_mov_b32 s19, s11
-; GFX1250-NEXT: s_mov_b32 s22, s10
-; GFX1250-NEXT: s_mov_b32 s23, s11
+; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s6, s2
+; GFX1250-NEXT: s_mov_b32 s7, s3
+; GFX1250-NEXT: s_mov_b32 s18, s2
+; GFX1250-NEXT: s_mov_b32 s19, s3
+; GFX1250-NEXT: s_mov_b32 s22, s2
+; GFX1250-NEXT: s_mov_b32 s23, s3
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s12, s2
-; GFX1250-NEXT: s_mov_b32 s13, s3
-; GFX1250-NEXT: s_mov_b32 s16, s4
-; GFX1250-NEXT: s_mov_b32 s17, s5
-; GFX1250-NEXT: s_mov_b32 s20, s6
-; GFX1250-NEXT: s_mov_b32 s21, s7
-; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-NEXT: s_mov_b32 s4, s10
+; GFX1250-NEXT: s_mov_b32 s5, s11
+; GFX1250-NEXT: s_mov_b32 s16, s12
+; GFX1250-NEXT: s_mov_b32 s17, s13
+; GFX1250-NEXT: s_mov_b32 s20, s14
+; GFX1250-NEXT: s_mov_b32 s21, s15
+; GFX1250-NEXT: buffer_load_b32 v0, off, s[4:7], null scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s8, s0
-; GFX1250-NEXT: s_mov_b32 s9, s1
+; GFX1250-NEXT: s_mov_b32 s0, s8
+; GFX1250-NEXT: s_mov_b32 s1, s9
; GFX1250-NEXT: v_min3_num_f32 v0, v2, v0, v1
-; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX1250-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
@@ -609,62 +609,62 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
;
; GFX1250-TRUE16-LABEL: test_fmin3_olt_0_f16:
; GFX1250-TRUE16: ; %bb.0:
-; GFX1250-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX1250-TRUE16-NEXT: s_mov_b32 s10, -1
-; GFX1250-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1250-TRUE16-NEXT: s_mov_b32 s14, s10
-; GFX1250-TRUE16-NEXT: s_mov_b32 s15, s11
-; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s10
-; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s11
-; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s10
-; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s11
+; GFX1250-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-TRUE16-NEXT: s_mov_b32 s2, -1
+; GFX1250-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-TRUE16-NEXT: s_mov_b32 s6, s2
+; GFX1250-TRUE16-NEXT: s_mov_b32 s7, s3
+; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s2
+; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s3
+; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s2
+; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s3
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-TRUE16-NEXT: s_mov_b32 s12, s2
-; GFX1250-TRUE16-NEXT: s_mov_b32 s13, s3
-; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s4
-; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s5
-; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s6
-; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s7
-; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_mov_b32 s4, s10
+; GFX1250-TRUE16-NEXT: s_mov_b32 s5, s11
+; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s12
+; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s13
+; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s14
+; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s15
+; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[4:7], null scope:SCOPE_SYS
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX1250-TRUE16-NEXT: s_mov_b32 s8, s0
-; GFX1250-TRUE16-NEXT: s_mov_b32 s9, s1
+; GFX1250-TRUE16-NEXT: s_mov_b32 s0, s8
+; GFX1250-TRUE16-NEXT: s_mov_b32 s1, s9
; GFX1250-TRUE16-NEXT: v_min3_num_f16 v0.l, v0.l, v1.l, v2.l
-; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], null
+; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
; GFX1250-TRUE16-NEXT: s_endpgm
;
; GFX1250-FAKE16-LABEL: test_fmin3_olt_0_f16:
; GFX1250-FAKE16: ; %bb.0:
-; GFX1250-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX1250-FAKE16-NEXT: s_mov_b32 s10, -1
-; GFX1250-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1250-FAKE16-NEXT: s_mov_b32 s14, s10
-; GFX1250-FAKE16-NEXT: s_mov_b32 s15, s11
-; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s10
-; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s11
-; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s10
-; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s11
+; GFX1250-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX1250-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-FAKE16-NEXT: s_mov_b32 s6, s2
+; GFX1250-FAKE16-NEXT: s_mov_b32 s7, s3
+; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s2
+; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s3
+; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s2
+; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s3
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-FAKE16-NEXT: s_mov_b32 s12, s2
-; GFX1250-FAKE16-NEXT: s_mov_b32 s13, s3
-; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s4
-; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s5
-; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s6
-; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s7
-; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_mov_b32 s4, s10
+; GFX1250-FAKE16-NEXT: s_mov_b32 s5, s11
+; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s12
+; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s13
+; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s14
+; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s15
+; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[4:7], null scope:SCOPE_SYS
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX1250-FAKE16-NEXT: s_mov_b32 s8, s0
-; GFX1250-FAKE16-NEXT: s_mov_b32 s9, s1
+; GFX1250-FAKE16-NEXT: s_mov_b32 s0, s8
+; GFX1250-FAKE16-NEXT: s_mov_b32 s1, s9
; GFX1250-FAKE16-NEXT: v_min3_num_f16 v0, v0, v1, v2
-; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null
+; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
; GFX1250-FAKE16-NEXT: s_endpgm
%a = load volatile half, ptr addrspace(1) %aptr, align 2
%b = load volatile half, ptr addrspace(1) %bptr, align 2
@@ -897,62 +897,62 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
;
; GFX1250-TRUE16-LABEL: test_fmin3_olt_1_f16:
; GFX1250-TRUE16: ; %bb.0:
-; GFX1250-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX1250-TRUE16-NEXT: s_mov_b32 s10, -1
-; GFX1250-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1250-TRUE16-NEXT: s_mov_b32 s14, s10
-; GFX1250-TRUE16-NEXT: s_mov_b32 s15, s11
-; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s10
-; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s11
-; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s10
-; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s11
+; GFX1250-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-TRUE16-NEXT: s_mov_b32 s2, -1
+; GFX1250-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-TRUE16-NEXT: s_mov_b32 s6, s2
+; GFX1250-TRUE16-NEXT: s_mov_b32 s7, s3
+; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s2
+; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s3
+; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s2
+; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s3
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-TRUE16-NEXT: s_mov_b32 s12, s2
-; GFX1250-TRUE16-NEXT: s_mov_b32 s13, s3
-; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s4
-; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s5
-; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s6
-; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s7
-; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_mov_b32 s4, s10
+; GFX1250-TRUE16-NEXT: s_mov_b32 s5, s11
+; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s12
+; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s13
+; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s14
+; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s15
+; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: buffer_load_u16 v2, off, s[16:19], null scope:SCOPE_SYS
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[20:23], null scope:SCOPE_SYS
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX1250-TRUE16-NEXT: s_mov_b32 s8, s0
-; GFX1250-TRUE16-NEXT: s_mov_b32 s9, s1
+; GFX1250-TRUE16-NEXT: s_mov_b32 s0, s8
+; GFX1250-TRUE16-NEXT: s_mov_b32 s1, s9
; GFX1250-TRUE16-NEXT: v_min3_num_f16 v0.l, v0.l, v1.l, v2.l
-; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], null
+; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
; GFX1250-TRUE16-NEXT: s_endpgm
;
; GFX1250-FAKE16-LABEL: test_fmin3_olt_1_f16:
; GFX1250-FAKE16: ; %bb.0:
-; GFX1250-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX1250-FAKE16-NEXT: s_mov_b32 s10, -1
-; GFX1250-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1250-FAKE16-NEXT: s_mov_b32 s14, s10
-; GFX1250-FAKE16-NEXT: s_mov_b32 s15, s11
-; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s10
-; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s11
-; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s10
-; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s11
+; GFX1250-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX1250-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-FAKE16-NEXT: s_mov_b32 s6, s2
+; GFX1250-FAKE16-NEXT: s_mov_b32 s7, s3
+; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s2
+; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s3
+; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s2
+; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s3
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-FAKE16-NEXT: s_mov_b32 s12, s2
-; GFX1250-FAKE16-NEXT: s_mov_b32 s13, s3
-; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s4
-; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s5
-; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s6
-; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s7
-; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_mov_b32 s4, s10
+; GFX1250-FAKE16-NEXT: s_mov_b32 s5, s11
+; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s12
+; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s13
+; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s14
+; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s15
+; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[4:7], null scope:SCOPE_SYS
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX1250-FAKE16-NEXT: s_mov_b32 s8, s0
-; GFX1250-FAKE16-NEXT: s_mov_b32 s9, s1
+; GFX1250-FAKE16-NEXT: s_mov_b32 s0, s8
+; GFX1250-FAKE16-NEXT: s_mov_b32 s1, s9
; GFX1250-FAKE16-NEXT: v_min3_num_f16 v0, v2, v0, v1
-; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null
+; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
; GFX1250-FAKE16-NEXT: s_endpgm
%a = load volatile half, ptr addrspace(1) %aptr, align 2
%b = load volatile half, ptr addrspace(1) %bptr, align 2
@@ -1217,36 +1217,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs
;
; GFX1250-LABEL: test_fmin3_olt_0_f64:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX1250-NEXT: s_mov_b32 s10, -1
-; GFX1250-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1250-NEXT: s_mov_b32 s14, s10
-; GFX1250-NEXT: s_mov_b32 s15, s11
-; GFX1250-NEXT: s_mov_b32 s18, s10
-; GFX1250-NEXT: s_mov_b32 s19, s11
+; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s6, s2
+; GFX1250-NEXT: s_mov_b32 s7, s3
+; GFX1250-NEXT: s_mov_b32 s18, s2
+; GFX1250-NEXT: s_mov_b32 s19, s3
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s12, s2
-; GFX1250-NEXT: s_mov_b32 s13, s3
-; GFX1250-NEXT: s_mov_b32 s16, s4
-; GFX1250-NEXT: s_mov_b32 s17, s5
-; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-NEXT: s_mov_b32 s4, s10
+; GFX1250-NEXT: s_mov_b32 s5, s11
+; GFX1250-NEXT: s_mov_b32 s16, s12
+; GFX1250-NEXT: s_mov_b32 s17, s13
+; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x1
-; GFX1250-NEXT: s_mov_b32 s12, s6
-; GFX1250-NEXT: s_mov_b32 s13, s7
-; GFX1250-NEXT: s_mov_b32 s8, s0
-; GFX1250-NEXT: buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-NEXT: s_mov_b32 s4, s14
+; GFX1250-NEXT: s_mov_b32 s5, s15
+; GFX1250-NEXT: s_mov_b32 s0, s8
+; GFX1250-NEXT: buffer_load_b64 v[4:5], off, s[4:7], null scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s9, s1
+; GFX1250-NEXT: s_mov_b32 s1, s9
; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
-; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX1250-NEXT: s_endpgm
%a = load volatile double, ptr addrspace(1) %aptr, align 4
%b = load volatile double, ptr addrspace(1) %bptr, align 4
@@ -1427,36 +1427,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs
;
; GFX1250-LABEL: test_fmin3_olt_1_f64:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX1250-NEXT: s_mov_b32 s10, -1
-; GFX1250-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1250-NEXT: s_mov_b32 s14, s10
-; GFX1250-NEXT: s_mov_b32 s15, s11
-; GFX1250-NEXT: s_mov_b32 s18, s10
-; GFX1250-NEXT: s_mov_b32 s19, s11
+; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s6, s2
+; GFX1250-NEXT: s_mov_b32 s7, s3
+; GFX1250-NEXT: s_mov_b32 s18, s2
+; GFX1250-NEXT: s_mov_b32 s19, s3
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s12, s2
-; GFX1250-NEXT: s_mov_b32 s13, s3
-; GFX1250-NEXT: s_mov_b32 s16, s4
-; GFX1250-NEXT: s_mov_b32 s17, s5
-; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-NEXT: s_mov_b32 s4, s10
+; GFX1250-NEXT: s_mov_b32 s5, s11
+; GFX1250-NEXT: s_mov_b32 s16, s12
+; GFX1250-NEXT: s_mov_b32 s17, s13
+; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x1
-; GFX1250-NEXT: s_mov_b32 s12, s6
-; GFX1250-NEXT: s_mov_b32 s13, s7
-; GFX1250-NEXT: s_mov_b32 s8, s0
-; GFX1250-NEXT: buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-NEXT: s_mov_b32 s4, s14
+; GFX1250-NEXT: s_mov_b32 s5, s15
+; GFX1250-NEXT: s_mov_b32 s0, s8
+; GFX1250-NEXT: buffer_load_b64 v[4:5], off, s[4:7], null scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s9, s1
+; GFX1250-NEXT: s_mov_b32 s1, s9
; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1]
-; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX1250-NEXT: s_endpgm
%a = load volatile double, ptr addrspace(1) %aptr, align 4
%b = load volatile double, ptr addrspace(1) %bptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
index e532dea..f807169 100644
--- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
@@ -11,22 +11,20 @@ define void @test_i8load_v4i8store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-SDAG: ; %bb.0:
; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
-; GCN-SDAG-NEXT: global_load_u8 v2, v[2:3], off
-; GCN-SDAG-NEXT: global_load_u8 v3, v[4:5], off
-; GCN-SDAG-NEXT: global_load_u8 v0, v[0:1], off
+; GCN-SDAG-NEXT: global_load_u8 v6, v[2:3], off
+; GCN-SDAG-NEXT: global_load_u8 v7, v[4:5], off
+; GCN-SDAG-NEXT: global_load_u8 v10, v[0:1], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x2
; GCN-SDAG-NEXT: s_wait_xcnt 0x0
-; GCN-SDAG-NEXT: v_lshlrev_b16 v1, 8, v2
+; GCN-SDAG-NEXT: v_lshlrev_b16 v0, 8, v6
; GCN-SDAG-NEXT: s_wait_loadcnt 0x1
-; GCN-SDAG-NEXT: v_lshlrev_b16 v2, 8, v3
+; GCN-SDAG-NEXT: v_lshlrev_b16 v1, 8, v7
+; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GCN-SDAG-NEXT: v_or_b32_e32 v1, v7, v1
; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GCN-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; GCN-SDAG-NEXT: v_or_b32_e32 v1, v3, v2
-; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GCN-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-SDAG-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_bitop2_b32 v0, v10, v0 bitop3:0x54
+; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GCN-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-SDAG-NEXT: global_store_b32 v[8:9], v0, off
; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31]
@@ -35,13 +33,15 @@ define void @test_i8load_v4i8store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-GISEL: ; %bb.0:
; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
-; GCN-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GCN-GISEL-NEXT: global_load_u8 v1, v[2:3], off
-; GCN-GISEL-NEXT: global_load_u8 v2, v[4:5], off
+; GCN-GISEL-NEXT: global_load_u8 v6, v[0:1], off
+; GCN-GISEL-NEXT: global_load_u8 v7, v[2:3], off
+; GCN-GISEL-NEXT: global_load_u8 v10, v[4:5], off
; GCN-GISEL-NEXT: s_wait_loadcnt 0x1
-; GCN-GISEL-NEXT: v_lshl_or_b32 v0, v1, 8, v0
+; GCN-GISEL-NEXT: s_wait_xcnt 0x2
+; GCN-GISEL-NEXT: v_lshl_or_b32 v0, v7, 8, v6
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT: v_dual_lshlrev_b32 v1, 16, v2 :: v_dual_lshlrev_b32 v2, 24, v2
+; GCN-GISEL-NEXT: s_wait_xcnt 0x1
+; GCN-GISEL-NEXT: v_dual_lshlrev_b32 v1, 16, v10 :: v_dual_lshlrev_b32 v2, 24, v10
; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
; GCN-GISEL-NEXT: global_store_b32 v[8:9], v0, off
@@ -64,21 +64,21 @@ define i16 @test_v7i16_load_store(ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr2
; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
; GCN-SDAG-NEXT: global_load_b128 v[4:7], v[0:1], off
-; GCN-SDAG-NEXT: global_load_b128 v[0:3], v[2:3], off
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[8:9], 0
-; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT: v_pk_add_u16 v10, v6, v2
-; GCN-SDAG-NEXT: v_pk_add_u16 v11, v7, v3
+; GCN-SDAG-NEXT: global_load_b128 v[8:11], v[2:3], off
; GCN-SDAG-NEXT: s_wait_xcnt 0x0
; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 12
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT: v_pk_add_u16 v1, v6, v10
+; GCN-SDAG-NEXT: v_pk_add_u16 v12, v7, v11
; GCN-SDAG-NEXT: v_mov_b64_e32 v[6:7], 8
-; GCN-SDAG-NEXT: v_pk_add_u16 v4, v4, v0
-; GCN-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v10
-; GCN-SDAG-NEXT: v_pk_add_u16 v5, v5, v1
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[10:11], 0
+; GCN-SDAG-NEXT: v_pk_add_u16 v5, v5, v9
+; GCN-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GCN-SDAG-NEXT: v_pk_add_u16 v4, v4, v8
; GCN-SDAG-NEXT: s_clause 0x2
-; GCN-SDAG-NEXT: global_store_b16 v[2:3], v11, off
-; GCN-SDAG-NEXT: global_store_b32 v[6:7], v10, off
-; GCN-SDAG-NEXT: global_store_b64 v[8:9], v[4:5], off
+; GCN-SDAG-NEXT: global_store_b16 v[2:3], v12, off
+; GCN-SDAG-NEXT: global_store_b32 v[6:7], v1, off
+; GCN-SDAG-NEXT: global_store_b64 v[10:11], v[4:5], off
; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31]
;
; GCN-GISEL-LABEL: test_v7i16_load_store:
@@ -86,28 +86,29 @@ define i16 @test_v7i16_load_store(ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr2
; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
; GCN-GISEL-NEXT: global_load_b128 v[4:7], v[0:1], off
-; GCN-GISEL-NEXT: global_load_b128 v[0:3], v[2:3], off
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[8:9], 0
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[10:11], 2
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[12:13], 4
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[14:15], 6
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[16:17], 8
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[18:19], 10
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[20:21], 12
+; GCN-GISEL-NEXT: global_load_b128 v[8:11], v[2:3], off
+; GCN-GISEL-NEXT: s_wait_xcnt 0x0
+; GCN-GISEL-NEXT: v_mov_b64_e32 v[2:3], 0
+; GCN-GISEL-NEXT: v_mov_b64_e32 v[12:13], 2
+; GCN-GISEL-NEXT: v_mov_b64_e32 v[14:15], 4
+; GCN-GISEL-NEXT: v_mov_b64_e32 v[16:17], 6
+; GCN-GISEL-NEXT: v_mov_b64_e32 v[18:19], 8
+; GCN-GISEL-NEXT: v_mov_b64_e32 v[20:21], 10
+; GCN-GISEL-NEXT: v_mov_b64_e32 v[22:23], 12
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT: v_pk_add_u16 v2, v6, v2
-; GCN-GISEL-NEXT: v_pk_add_u16 v4, v4, v0
-; GCN-GISEL-NEXT: v_pk_add_u16 v1, v5, v1
-; GCN-GISEL-NEXT: v_pk_add_u16 v3, v7, v3
+; GCN-GISEL-NEXT: v_pk_add_u16 v1, v6, v10
+; GCN-GISEL-NEXT: v_pk_add_u16 v4, v4, v8
+; GCN-GISEL-NEXT: v_pk_add_u16 v5, v5, v9
+; GCN-GISEL-NEXT: v_pk_add_u16 v6, v7, v11
; GCN-GISEL-NEXT: s_clause 0x6
-; GCN-GISEL-NEXT: global_store_b16 v[8:9], v4, off
-; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[10:11], v4, off
-; GCN-GISEL-NEXT: global_store_b16 v[12:13], v1, off
-; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[14:15], v1, off
-; GCN-GISEL-NEXT: global_store_b16 v[16:17], v2, off
-; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[18:19], v2, off
-; GCN-GISEL-NEXT: global_store_b16 v[20:21], v3, off
-; GCN-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GCN-GISEL-NEXT: global_store_b16 v[2:3], v4, off
+; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[12:13], v4, off
+; GCN-GISEL-NEXT: global_store_b16 v[14:15], v5, off
+; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[16:17], v5, off
+; GCN-GISEL-NEXT: global_store_b16 v[18:19], v1, off
+; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[20:21], v1, off
+; GCN-GISEL-NEXT: global_store_b16 v[22:23], v6, off
+; GCN-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31]
%vec1 = load <7 x i16>, ptr addrspace(1) %ptr1
%insert = insertelement <7 x i16> %vec1, i16 20, i32 4
@@ -253,8 +254,8 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-SDAG-NEXT: global_load_b128 v[22:25], v[0:1], off offset:32
; GCN-SDAG-NEXT: global_load_b128 v[26:29], v[0:1], off offset:16
; GCN-SDAG-NEXT: global_load_b128 v[30:33], v[0:1], off
-; GCN-SDAG-NEXT: global_load_b128 v[0:3], v[0:1], off offset:64
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[36:37], 0x70
+; GCN-SDAG-NEXT: global_load_b128 v[34:37], v[0:1], off offset:64
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x70
; GCN-SDAG-NEXT: v_mov_b64_e32 v[48:49], 48
; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x60
; GCN-SDAG-NEXT: v_mov_b64_e32 v[50:51], 32
@@ -262,14 +263,15 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-SDAG-NEXT: v_mov_b64_e32 v[66:67], 0
; GCN-SDAG-NEXT: v_mov_b64_e32 v[52:53], 0x50
; GCN-SDAG-NEXT: v_mov_b64_e32 v[54:55], 64
-; GCN-SDAG-NEXT: v_dual_mov_b32 v34, 0xc8 :: v_dual_mov_b32 v35, 0
+; GCN-SDAG-NEXT: s_wait_xcnt 0x0
+; GCN-SDAG-NEXT: v_dual_mov_b32 v0, 0xc8 :: v_dual_mov_b32 v1, 0
; GCN-SDAG-NEXT: s_wait_loadcnt 0x7
-; GCN-SDAG-NEXT: global_store_b128 v[36:37], v[6:9], off
+; GCN-SDAG-NEXT: global_store_b128 v[2:3], v[6:9], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x6
; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[10:13], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x5
; GCN-SDAG-NEXT: s_wait_xcnt 0x1
-; GCN-SDAG-NEXT: v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v37, v17
+; GCN-SDAG-NEXT: v_dual_mov_b32 v2, v16 :: v_dual_mov_b32 v3, v17
; GCN-SDAG-NEXT: s_wait_xcnt 0x0
; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[12:13], v[12:13], v[12:13]
; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11]
@@ -286,8 +288,8 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[8:9], v[8:9], v[8:9]
; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[6:7], v[6:7], v[6:7]
; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[50:51], v[2:3], v[2:3]
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[48:49], v[0:1], v[0:1]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[50:51], v[36:37], v[36:37]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[48:49], v[34:35], v[34:35]
; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[16:17], v[16:17], v[16:17]
; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[14:15], 0xc8, v[14:15]
; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[24:25], 0x64, v[24:25]
@@ -298,8 +300,8 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[20:21], v[20:21], v[20:21]
; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[18:19], v[18:19], v[18:19]
; GCN-SDAG-NEXT: s_clause 0x1
-; GCN-SDAG-NEXT: global_store_b128 v[52:53], v[34:37], off
-; GCN-SDAG-NEXT: global_store_b128 v[54:55], v[0:3], off
+; GCN-SDAG-NEXT: global_store_b128 v[52:53], v[0:3], off
+; GCN-SDAG-NEXT: global_store_b128 v[54:55], v[34:37], off
; GCN-SDAG-NEXT: s_clause 0x7
; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:96
; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off offset:112
@@ -309,7 +311,7 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[18:21], off offset:48
; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[30:33], off
; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[26:29], off offset:16
-; GCN-SDAG-NEXT: s_wait_xcnt 0x8
+; GCN-SDAG-NEXT: s_wait_xcnt 0x9
; GCN-SDAG-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v33
; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31]
;
@@ -325,7 +327,7 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-GISEL-NEXT: global_load_b128 v[22:25], v[0:1], off offset:48
; GCN-GISEL-NEXT: global_load_b128 v[26:29], v[0:1], off offset:96
; GCN-GISEL-NEXT: global_load_b128 v[30:33], v[0:1], off offset:112
-; GCN-GISEL-NEXT: global_load_b128 v[0:3], v[0:1], off offset:64
+; GCN-GISEL-NEXT: global_load_b128 v[34:37], v[0:1], off offset:64
; GCN-GISEL-NEXT: v_mov_b64_e32 v[38:39], 0
; GCN-GISEL-NEXT: v_mov_b64_e32 v[48:49], 16
; GCN-GISEL-NEXT: v_mov_b64_e32 v[50:51], 32
@@ -333,7 +335,8 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-GISEL-NEXT: v_mov_b64_e32 v[66:67], 0x60
; GCN-GISEL-NEXT: v_mov_b64_e32 v[68:69], 0x70
; GCN-GISEL-NEXT: v_mov_b64_e32 v[54:55], 64
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[34:35], 0xc8
+; GCN-GISEL-NEXT: s_wait_xcnt 0x0
+; GCN-GISEL-NEXT: v_mov_b64_e32 v[0:1], 0xc8
; GCN-GISEL-NEXT: v_mov_b64_e32 v[64:65], 0x50
; GCN-GISEL-NEXT: s_wait_loadcnt 0x6
; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[10:13], off
@@ -349,7 +352,7 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-GISEL-NEXT: global_store_b128 v[68:69], v[30:33], off
; GCN-GISEL-NEXT: s_wait_xcnt 0x5
; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[12:13], v[12:13], v[12:13]
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[36:37], v[8:9]
+; GCN-GISEL-NEXT: v_mov_b64_e32 v[2:3], v[8:9]
; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11]
; GCN-GISEL-NEXT: s_wait_xcnt 0x4
; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[14:15], v[14:15], v[14:15]
@@ -361,8 +364,8 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[22:23], v[22:23], v[22:23]
; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[24:25], v[24:25], v[24:25]
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[48:49], v[0:1], v[0:1]
-; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[50:51], v[2:3], v[2:3]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[48:49], v[34:35], v[34:35]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[50:51], v[36:37], v[36:37]
; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[6:7], 0xc8, v[6:7]
; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[8:9], v[8:9], v[8:9]
; GCN-GISEL-NEXT: s_wait_xcnt 0x1
@@ -372,8 +375,8 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[30:31], v[30:31], v[30:31]
; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[32:33], v[32:33], v[32:33]
; GCN-GISEL-NEXT: s_clause 0x1
-; GCN-GISEL-NEXT: global_store_b128 v[54:55], v[0:3], off
-; GCN-GISEL-NEXT: global_store_b128 v[64:65], v[34:37], off
+; GCN-GISEL-NEXT: global_store_b128 v[54:55], v[34:37], off
+; GCN-GISEL-NEXT: global_store_b128 v[64:65], v[0:3], off
; GCN-GISEL-NEXT: s_clause 0x7
; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[10:13], off
; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[14:17], off offset:16
@@ -383,7 +386,7 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off offset:80
; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[26:29], off offset:96
; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[30:33], off offset:112
-; GCN-GISEL-NEXT: s_wait_xcnt 0x9
+; GCN-GISEL-NEXT: s_wait_xcnt 0x8
; GCN-GISEL-NEXT: v_dual_mov_b32 v0, v12 :: v_dual_mov_b32 v1, v13
; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31]
%a = load <16 x i64>, ptr addrspace(1) %ptr_a, align 4
@@ -402,16 +405,17 @@ define amdgpu_kernel void @test_v7i16_load_store_kernel(ptr addrspace(1) %ptr1,
; GCN-SDAG-LABEL: test_v7i16_load_store_kernel:
; GCN-SDAG: ; %bb.0:
; GCN-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GCN-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GCN-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GCN-SDAG-NEXT: s_wait_xcnt 0x0
; GCN-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[8:9], 12
; GCN-SDAG-NEXT: v_mov_b64_e32 v[10:11], 8
; GCN-SDAG-NEXT: v_mov_b64_e32 v[12:13], 0
; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
; GCN-SDAG-NEXT: s_clause 0x1
-; GCN-SDAG-NEXT: global_load_b128 v[0:3], v4, s[0:1] scale_offset
-; GCN-SDAG-NEXT: global_load_b128 v[4:7], v4, s[2:3] scale_offset
+; GCN-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
+; GCN-SDAG-NEXT: global_load_b128 v[4:7], v8, s[2:3] scale_offset
+; GCN-SDAG-NEXT: s_wait_xcnt 0x0
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[8:9], 12
; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
; GCN-SDAG-NEXT: v_pk_add_u16 v3, v3, v7
; GCN-SDAG-NEXT: v_pk_add_u16 v2, v2, v6
@@ -428,10 +432,9 @@ define amdgpu_kernel void @test_v7i16_load_store_kernel(ptr addrspace(1) %ptr1,
; GCN-GISEL-LABEL: test_v7i16_load_store_kernel:
; GCN-GISEL: ; %bb.0:
; GCN-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GCN-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GCN-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GCN-GISEL-NEXT: s_wait_xcnt 0x0
; GCN-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[8:9], 0
; GCN-GISEL-NEXT: v_mov_b64_e32 v[10:11], 2
; GCN-GISEL-NEXT: v_mov_b64_e32 v[12:13], 4
; GCN-GISEL-NEXT: v_mov_b64_e32 v[14:15], 6
@@ -440,8 +443,10 @@ define amdgpu_kernel void @test_v7i16_load_store_kernel(ptr addrspace(1) %ptr1,
; GCN-GISEL-NEXT: v_mov_b64_e32 v[20:21], 12
; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
; GCN-GISEL-NEXT: s_clause 0x1
-; GCN-GISEL-NEXT: global_load_b128 v[0:3], v4, s[0:1] scale_offset
-; GCN-GISEL-NEXT: global_load_b128 v[4:7], v4, s[2:3] scale_offset
+; GCN-GISEL-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
+; GCN-GISEL-NEXT: global_load_b128 v[4:7], v8, s[2:3] scale_offset
+; GCN-GISEL-NEXT: s_wait_xcnt 0x0
+; GCN-GISEL-NEXT: v_mov_b64_e32 v[8:9], 0
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
; GCN-GISEL-NEXT: v_pk_add_u16 v0, v0, v4
; GCN-GISEL-NEXT: v_pk_add_u16 v1, v1, v5
diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
index 92836d8..63db24a 100644
--- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
+++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
@@ -486,7 +486,7 @@ body: |
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
- ; CHECK-NEXT: INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; CHECK-NEXT: S_ENDPGM 0
bb.0:
S_NOP 0, implicit-def $agpr0
@@ -516,7 +516,7 @@ body: |
S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
- INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, %0:vreg_512_align2
+ INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, %0:vreg_512_align2
S_ENDPGM 0
...
@@ -1368,7 +1368,7 @@ body: |
; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
@@ -1408,7 +1408,7 @@ body: |
undef %2.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
early-clobber %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %2, 0, 0, 0, implicit $mode, implicit $exec
early-clobber %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, %4
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, %4
S_CBRANCH_VCCNZ %bb.1, implicit $vcc
S_BRANCH %bb.2
@@ -1726,7 +1726,7 @@ body: |
; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
@@ -1763,7 +1763,7 @@ body: |
undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
%0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
%4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, %4
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, %4
S_CBRANCH_VCCNZ %bb.1, implicit $vcc
S_BRANCH %bb.2
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
index 9cbdc38..5b3e486 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
@@ -8,16 +8,16 @@
define amdgpu_kernel void @s_input_output_i128() {
; GFX908-LABEL: name: s_input_output_i128
; GFX908: bb.0 (%ir-block.0):
- ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9633802 /* regdef:SGPR_128 */, def %13
+ ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10682378 /* regdef:SGPR_128 */, def %13
; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %13
- ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9633801 /* reguse:SGPR_128 */, [[COPY]]
+ ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 10682377 /* reguse:SGPR_128 */, [[COPY]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: s_input_output_i128
; GFX90A: bb.0 (%ir-block.0):
- ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9633802 /* regdef:SGPR_128 */, def %11
+ ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10682378 /* regdef:SGPR_128 */, def %11
; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %11
- ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9633801 /* reguse:SGPR_128 */, [[COPY]]
+ ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 10682377 /* reguse:SGPR_128 */, [[COPY]]
; GFX90A-NEXT: S_ENDPGM 0
%val = tail call i128 asm sideeffect "; def $0", "=s"()
call void asm sideeffect "; use $0", "s"(i128 %val)
@@ -27,16 +27,16 @@ define amdgpu_kernel void @s_input_output_i128() {
define amdgpu_kernel void @v_input_output_i128() {
; GFX908-LABEL: name: v_input_output_i128
; GFX908: bb.0 (%ir-block.0):
- ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7798794 /* regdef:VReg_128 */, def %13
+ ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:VReg_128 */, def %13
; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %13
- ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7798793 /* reguse:VReg_128 */, [[COPY]]
+ ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7929865 /* reguse:VReg_128 */, [[COPY]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: v_input_output_i128
; GFX90A: bb.0 (%ir-block.0):
- ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7995402 /* regdef:VReg_128_Align2 */, def %11
+ ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8257546 /* regdef:VReg_128_Align2 */, def %11
; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %11
- ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7995401 /* reguse:VReg_128_Align2 */, [[COPY]]
+ ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8257545 /* reguse:VReg_128_Align2 */, [[COPY]]
; GFX90A-NEXT: S_ENDPGM 0
%val = tail call i128 asm sideeffect "; def $0", "=v"()
call void asm sideeffect "; use $0", "v"(i128 %val)
@@ -47,16 +47,16 @@ define amdgpu_kernel void @a_input_output_i128() {
; GFX908-LABEL: name: a_input_output_i128
; GFX908: bb.0 (%ir-block.0):
- ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8323082 /* regdef:AReg_128 */, def %13
+ ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8847370 /* regdef:AReg_128 */, def %13
; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %13
- ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY]]
+ ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: a_input_output_i128
; GFX90A: bb.0 (%ir-block.0):
- ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8650762 /* regdef:AReg_128_Align2 */, def %11
+ ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9568266 /* regdef:AReg_128_Align2 */, def %11
; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %11
- ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY]]
+ ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY]]
; GFX90A-NEXT: S_ENDPGM 0
%val = call i128 asm sideeffect "; def $0", "=a"()
call void asm sideeffect "; use $0", "a"(i128 %val)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.id.ll
index 90fcb51..fa97380 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.id.ll
@@ -11,14 +11,11 @@ declare i32 @llvm.amdgcn.cluster.id.z() #0
define amdgpu_kernel void @test_cluster_id_x(ptr addrspace(1) %out) {
; CHECK-UNKNOWN-LABEL: test_cluster_id_x:
; CHECK-UNKNOWN: ; %bb.0:
-; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; CHECK-UNKNOWN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0
; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0
-; CHECK-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1]
+; CHECK-UNKNOWN-NEXT: global_store_b32 v1, v0, s[2:3]
; CHECK-UNKNOWN-NEXT: s_endpgm
-; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
-; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
-; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
;
; CHECK-MESA3D-LABEL: test_cluster_id_x:
; CHECK-MESA3D: .amd_kernel_code_t
@@ -68,7 +65,7 @@ define amdgpu_kernel void @test_cluster_id_x(ptr addrspace(1) %out) {
; CHECK-MESA3D-NEXT: is_ptr64 = 1
; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -98,14 +95,11 @@ define amdgpu_kernel void @test_cluster_id_x(ptr addrspace(1) %out) {
;
; CHECK-G-UNKNOWN-LABEL: test_cluster_id_x:
; CHECK-G-UNKNOWN: ; %bb.0:
-; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0
; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0
-; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1]
+; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[2:3]
; CHECK-G-UNKNOWN-NEXT: s_endpgm
-; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
-; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
-; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
;
; CHECK-G-MESA3D-LABEL: test_cluster_id_x:
; CHECK-G-MESA3D: .amd_kernel_code_t
@@ -155,7 +149,7 @@ define amdgpu_kernel void @test_cluster_id_x(ptr addrspace(1) %out) {
; CHECK-G-MESA3D-NEXT: is_ptr64 = 1
; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -190,14 +184,11 @@ define amdgpu_kernel void @test_cluster_id_x(ptr addrspace(1) %out) {
define amdgpu_kernel void @test_cluster_id_y(ptr addrspace(1) %out) #1 {
; CHECK-UNKNOWN-LABEL: test_cluster_id_y:
; CHECK-UNKNOWN: ; %bb.0:
-; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; CHECK-UNKNOWN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, ttmp7 :: v_dual_mov_b32 v1, 0
; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0
-; CHECK-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1]
+; CHECK-UNKNOWN-NEXT: global_store_b32 v1, v0, s[2:3]
; CHECK-UNKNOWN-NEXT: s_endpgm
-; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
-; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
-; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
;
; CHECK-MESA3D-LABEL: test_cluster_id_y:
; CHECK-MESA3D: .amd_kernel_code_t
@@ -247,7 +238,7 @@ define amdgpu_kernel void @test_cluster_id_y(ptr addrspace(1) %out) #1 {
; CHECK-MESA3D-NEXT: is_ptr64 = 1
; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -277,14 +268,11 @@ define amdgpu_kernel void @test_cluster_id_y(ptr addrspace(1) %out) #1 {
;
; CHECK-G-UNKNOWN-LABEL: test_cluster_id_y:
; CHECK-G-UNKNOWN: ; %bb.0:
-; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v0, ttmp7 :: v_dual_mov_b32 v1, 0
; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0
-; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1]
+; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[2:3]
; CHECK-G-UNKNOWN-NEXT: s_endpgm
-; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
-; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
-; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
;
; CHECK-G-MESA3D-LABEL: test_cluster_id_y:
; CHECK-G-MESA3D: .amd_kernel_code_t
@@ -334,7 +322,7 @@ define amdgpu_kernel void @test_cluster_id_y(ptr addrspace(1) %out) #1 {
; CHECK-G-MESA3D-NEXT: is_ptr64 = 1
; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -369,16 +357,14 @@ define amdgpu_kernel void @test_cluster_id_y(ptr addrspace(1) %out) #1 {
define amdgpu_kernel void @test_cluster_id_z(ptr addrspace(1) %out) #1 {
; CHECK-UNKNOWN-LABEL: test_cluster_id_z:
; CHECK-UNKNOWN: ; %bb.0:
-; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; CHECK-UNKNOWN-NEXT: s_lshr_b32 s2, ttmp7, 16
+; CHECK-UNKNOWN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; CHECK-UNKNOWN-NEXT: s_wait_xcnt 0x0
+; CHECK-UNKNOWN-NEXT: s_lshr_b32 s0, ttmp7, 16
; CHECK-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0
-; CHECK-UNKNOWN-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-UNKNOWN-NEXT: global_store_b32 v0, v1, s[2:3]
; CHECK-UNKNOWN-NEXT: s_endpgm
-; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
-; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
-; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
;
; CHECK-MESA3D-LABEL: test_cluster_id_z:
; CHECK-MESA3D: .amd_kernel_code_t
@@ -428,7 +414,7 @@ define amdgpu_kernel void @test_cluster_id_z(ptr addrspace(1) %out) #1 {
; CHECK-MESA3D-NEXT: is_ptr64 = 1
; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -460,16 +446,14 @@ define amdgpu_kernel void @test_cluster_id_z(ptr addrspace(1) %out) #1 {
;
; CHECK-G-UNKNOWN-LABEL: test_cluster_id_z:
; CHECK-G-UNKNOWN: ; %bb.0:
-; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; CHECK-G-UNKNOWN-NEXT: s_lshr_b32 s2, ttmp7, 16
+; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; CHECK-G-UNKNOWN-NEXT: s_wait_xcnt 0x0
+; CHECK-G-UNKNOWN-NEXT: s_lshr_b32 s0, ttmp7, 16
; CHECK-G-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0
-; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1]
+; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[2:3]
; CHECK-G-UNKNOWN-NEXT: s_endpgm
-; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
-; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
-; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
;
; CHECK-G-MESA3D-LABEL: test_cluster_id_z:
; CHECK-G-MESA3D: .amd_kernel_code_t
@@ -519,7 +503,7 @@ define amdgpu_kernel void @test_cluster_id_z(ptr addrspace(1) %out) #1 {
; CHECK-G-MESA3D-NEXT: is_ptr64 = 1
; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll
index aa3b7b3..3ef84a3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll
@@ -67,7 +67,7 @@ define amdgpu_kernel void @test_workgroup_id_x(ptr addrspace(1) %out) #1 {
; CHECK-MESA3D-NEXT: is_ptr64 = 1
; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -155,7 +155,7 @@ define amdgpu_kernel void @test_workgroup_id_x(ptr addrspace(1) %out) #1 {
; CHECK-G-MESA3D-NEXT: is_ptr64 = 1
; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -246,7 +246,7 @@ define amdgpu_kernel void @test_workgroup_id_x_optimized(ptr addrspace(1) %out)
; CHECK-MESA3D-NEXT: is_ptr64 = 1
; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -330,7 +330,7 @@ define amdgpu_kernel void @test_workgroup_id_x_optimized(ptr addrspace(1) %out)
; CHECK-G-MESA3D-NEXT: is_ptr64 = 1
; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -421,7 +421,7 @@ define amdgpu_kernel void @test_workgroup_id_y(ptr addrspace(1) %out) #1 {
; CHECK-MESA3D-NEXT: is_ptr64 = 1
; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -509,7 +509,7 @@ define amdgpu_kernel void @test_workgroup_id_y(ptr addrspace(1) %out) #1 {
; CHECK-G-MESA3D-NEXT: is_ptr64 = 1
; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -600,7 +600,7 @@ define amdgpu_kernel void @test_workgroup_id_y_optimized(ptr addrspace(1) %out)
; CHECK-MESA3D-NEXT: is_ptr64 = 1
; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -684,7 +684,7 @@ define amdgpu_kernel void @test_workgroup_id_y_optimized(ptr addrspace(1) %out)
; CHECK-G-MESA3D-NEXT: is_ptr64 = 1
; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -775,7 +775,7 @@ define amdgpu_kernel void @test_workgroup_id_z(ptr addrspace(1) %out) #1 {
; CHECK-MESA3D-NEXT: is_ptr64 = 1
; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -863,7 +863,7 @@ define amdgpu_kernel void @test_workgroup_id_z(ptr addrspace(1) %out) #1 {
; CHECK-G-MESA3D-NEXT: is_ptr64 = 1
; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -956,7 +956,7 @@ define amdgpu_kernel void @test_workgroup_flat_id(ptr addrspace(1) %out) {
; CHECK-MESA3D-NEXT: is_ptr64 = 1
; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -1044,7 +1044,7 @@ define amdgpu_kernel void @test_workgroup_flat_id(ptr addrspace(1) %out) {
; CHECK-G-MESA3D-NEXT: is_ptr64 = 1
; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -1135,7 +1135,7 @@ define amdgpu_kernel void @test_workgroup_id_z_optimized(ptr addrspace(1) %out)
; CHECK-MESA3D-NEXT: is_ptr64 = 1
; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -1219,7 +1219,7 @@ define amdgpu_kernel void @test_workgroup_id_z_optimized(ptr addrspace(1) %out)
; CHECK-G-MESA3D-NEXT: is_ptr64 = 1
; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll
index afe37e3..b8ff9e5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll
@@ -65,7 +65,7 @@ define amdgpu_kernel void @test_workgroup_max_flat_id(ptr addrspace(1) %out) #1
; CHECK-MESA3D-NEXT: is_ptr64 = 1
; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -153,7 +153,7 @@ define amdgpu_kernel void @test_workgroup_max_flat_id(ptr addrspace(1) %out) #1
; CHECK-G-MESA3D-NEXT: is_ptr64 = 1
; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll
index 7ea4fa5..9bca696 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll
@@ -67,7 +67,7 @@ define amdgpu_kernel void @test_workgroup_max_id_x(ptr addrspace(1) %out) #1 {
; CHECK-MESA3D-NEXT: is_ptr64 = 1
; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -155,7 +155,7 @@ define amdgpu_kernel void @test_workgroup_max_id_x(ptr addrspace(1) %out) #1 {
; CHECK-G-MESA3D-NEXT: is_ptr64 = 1
; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -246,7 +246,7 @@ define amdgpu_kernel void @test_workgroup_max_id_x_optimized(ptr addrspace(1) %o
; CHECK-MESA3D-NEXT: is_ptr64 = 1
; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -330,7 +330,7 @@ define amdgpu_kernel void @test_workgroup_max_id_x_optimized(ptr addrspace(1) %o
; CHECK-G-MESA3D-NEXT: is_ptr64 = 1
; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -421,7 +421,7 @@ define amdgpu_kernel void @test_workgroup_max_id_y(ptr addrspace(1) %out) #1 {
; CHECK-MESA3D-NEXT: is_ptr64 = 1
; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -509,7 +509,7 @@ define amdgpu_kernel void @test_workgroup_max_id_y(ptr addrspace(1) %out) #1 {
; CHECK-G-MESA3D-NEXT: is_ptr64 = 1
; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -600,7 +600,7 @@ define amdgpu_kernel void @test_workgroup_max_id_y_optimized(ptr addrspace(1) %o
; CHECK-MESA3D-NEXT: is_ptr64 = 1
; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -684,7 +684,7 @@ define amdgpu_kernel void @test_workgroup_max_id_y_optimized(ptr addrspace(1) %o
; CHECK-G-MESA3D-NEXT: is_ptr64 = 1
; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -775,7 +775,7 @@ define amdgpu_kernel void @test_workgroup_max_id_z(ptr addrspace(1) %out) #1 {
; CHECK-MESA3D-NEXT: is_ptr64 = 1
; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -863,7 +863,7 @@ define amdgpu_kernel void @test_workgroup_max_id_z(ptr addrspace(1) %out) #1 {
; CHECK-G-MESA3D-NEXT: is_ptr64 = 1
; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -954,7 +954,7 @@ define amdgpu_kernel void @test_workgroup_max_id_z_optimized(ptr addrspace(1) %o
; CHECK-MESA3D-NEXT: is_ptr64 = 1
; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0
@@ -1038,7 +1038,7 @@ define amdgpu_kernel void @test_workgroup_max_id_z_optimized(ptr addrspace(1) %o
; CHECK-G-MESA3D-NEXT: is_ptr64 = 1
; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0
; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0
-; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1
; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0
; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
index 56215ca..67d0410 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
@@ -59,21 +59,20 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) {
; GFX9-NEXT: global_store_dword v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
-; GFX1250-SDAG-LABEL: is_private_vgpr:
-; GFX1250-SDAG: ; %bb.0:
-; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset scope:SCOPE_SYS
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v1
-; GFX1250-SDAG-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x4000000, v0
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v0, off
-; GFX1250-SDAG-NEXT: s_endpgm
+; GFX1250-LABEL: is_private_vgpr:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x4000000, v0
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX1250-NEXT: global_store_b32 v[0:1], v0, off
+; GFX1250-NEXT: s_endpgm
;
; CI-GISEL-LABEL: is_private_vgpr:
; CI-GISEL: ; %bb.0:
@@ -122,22 +121,6 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) {
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT: global_store_b32 v[0:1], v0, off
; GFX11-NEXT: s_endpgm
-;
-; GFX1250-GISEL-LABEL: is_private_vgpr:
-; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset scope:SCOPE_SYS
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v1, v2
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x4000000, v0
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v0, off
-; GFX1250-GISEL-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds ptr, ptr addrspace(1) %ptr.ptr, i32 %id
%ptr = load volatile ptr, ptr addrspace(1) %gep
@@ -206,9 +189,8 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
; GFX1250-SDAG-LABEL: is_private_sgpr:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: s_xor_b32 s0, s0, s1
+; GFX1250-SDAG-NEXT: s_xor_b32 s0, s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s0, 0x4000000
; GFX1250-SDAG-NEXT: s_cselect_b32 s0, -1, 0
@@ -285,9 +267,8 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250-GISEL-NEXT: s_xor_b32 s0, s1, s0
+; GFX1250-GISEL-NEXT: s_xor_b32 s0, s1, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s0, 0x4000000
; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB1_2
; GFX1250-GISEL-NEXT: ; %bb.1: ; %bb0
@@ -311,5 +292,4 @@ bb1:
; CI: {{.*}}
; GFX10-GISEL: {{.*}}
; GFX11-GISEL: {{.*}}
-; GFX1250: {{.*}}
; SI-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll
index 4f7bbf8..42a50bb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll
@@ -5,13 +5,13 @@
define amdgpu_kernel void @v_permlane_bcast_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
; GFX1250-LABEL: v_permlane_bcast_b32_vss:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s3, s4
+; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s3, s6
; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1250-NEXT: s_endpgm
%v = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 %src1, i32 %src2)
@@ -92,13 +92,13 @@ define amdgpu_kernel void @v_permlane_bcast_b32_vvv(ptr addrspace(1) %out, i32 %
define amdgpu_kernel void @v_permlane_down_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
; GFX1250-LABEL: v_permlane_down_b32_vss:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s3, s4
+; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s3, s6
; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1250-NEXT: s_endpgm
%v = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 %src1, i32 %src2)
@@ -179,13 +179,13 @@ define amdgpu_kernel void @v_permlane_down_b32_vvv(ptr addrspace(1) %out, i32 %s
define amdgpu_kernel void @v_permlane_up_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
; GFX1250-LABEL: v_permlane_up_b32_vss:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s3, s4
+; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s3, s6
; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1250-NEXT: s_endpgm
%v = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 %src1, i32 %src2)
@@ -266,13 +266,13 @@ define amdgpu_kernel void @v_permlane_up_b32_vvv(ptr addrspace(1) %out, i32 %src
define amdgpu_kernel void @v_permlane_xor_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
; GFX1250-LABEL: v_permlane_xor_b32_vss:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s3, s4
+; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s3, s6
; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1250-NEXT: s_endpgm
%v = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 %src1, i32 %src2)
diff --git a/llvm/test/CodeGen/AMDGPU/load-store-opt-scale-offset.mir b/llvm/test/CodeGen/AMDGPU/load-store-opt-scale-offset.mir
index 76e2092..abcae69 100644
--- a/llvm/test/CodeGen/AMDGPU/load-store-opt-scale-offset.mir
+++ b/llvm/test/CodeGen/AMDGPU/load-store-opt-scale-offset.mir
@@ -69,9 +69,9 @@ body: |
bb.0:
; GCN-LABEL: name: merge_s_load_x1_x1_imm_no_scale_offset
; GCN: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
- ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
- ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0
- ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1
+ ; GCN-NEXT: early-clobber %3:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY %3.sub0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub1
%0:sgpr_64 = IMPLICIT_DEF
%1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32))
%2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s32))
diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
index 1e6b77e..4ad161c 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
@@ -471,13 +471,13 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r
; GFX1250-NEXT: s_cmp_eq_u32 s0, 0
; GFX1250-NEXT: s_cbranch_scc1 .LBB4_3
; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader
-; GFX1250-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
+; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 4, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[6:7], v[0:1]
-; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[4:5], v[0:1]
+; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[10:11], v[0:1]
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[8:9], v[0:1]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 0xb0, v[2:3]
; GFX1250-NEXT: .LBB4_2: ; %for.body
@@ -602,13 +602,13 @@ define amdgpu_kernel void @copy_global_divergent(ptr addrspace(1) nocapture %d,
; GFX1250-NEXT: s_cmp_eq_u32 s0, 0
; GFX1250-NEXT: s_cbranch_scc1 .LBB5_3
; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader
-; GFX1250-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
+; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 4, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[6:7], v[0:1]
-; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[4:5], v[0:1]
+; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[10:11], v[0:1]
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[8:9], v[0:1]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 0xb0, v[2:3]
; GFX1250-NEXT: .LBB5_2: ; %for.body
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index dbcd370..08ec0c8 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -1117,18 +1117,19 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
;
; GFX1250-LABEL: mad_i64_i32_uniform:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1250-NEXT: s_mov_b32 s7, 0
+; GFX1250-NEXT: s_mov_b32 s5, 0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s6, s2
+; GFX1250-NEXT: s_mov_b32 s4, s2
; GFX1250-NEXT: s_mov_b32 s2, s3
-; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s3, s5
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250-NEXT: s_mul_u64 s[2:3], s[6:7], s[2:3]
-; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX1250-NEXT: s_mul_u64 s[2:3], s[4:5], s[2:3]
+; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[6:7]
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir
index 61f2629..c19d5a6 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir
@@ -33,7 +33,7 @@ name: asm_write_vgpr_accvgpr_write_read
body: |
bb.0:
- INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def $vgpr0
+ INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def $vgpr0
$agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
...
@@ -47,7 +47,7 @@ name: asm_write_vgpr_accvgpr_write_read_partialnop
body: |
bb.0:
- INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def $vgpr0
+ INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def $vgpr0
S_NOP 0
$agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
...
@@ -60,7 +60,7 @@ name: asm_write_vgpr_accvgpr_write_read_otherreg
body: |
bb.0:
liveins: $vgpr0
- INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def $vgpr1
+ INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def $vgpr1
$agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
...
diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll
index fef9a9a..ae08054 100644
--- a/llvm/test/CodeGen/AMDGPU/max.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.ll
@@ -257,16 +257,15 @@ define amdgpu_kernel void @v_test_imax_sge_i8(ptr addrspace(1) %out, ptr addrspa
;
; GFX1250-LABEL: v_test_imax_sge_i8:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_load_i8 s2, s[2:3], 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_i8 s3, s[4:5], 0x0
+; GFX1250-NEXT: s_load_i8 s4, s[2:3], 0x0
+; GFX1250-NEXT: s_load_i8 s5, s[6:7], 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_max_i32 s2, s2, s3
+; GFX1250-NEXT: s_max_i32 s2, s4, s5
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-NEXT: global_store_b8 v0, v1, s[0:1]
@@ -701,16 +700,15 @@ define amdgpu_kernel void @v_test_umax_uge_i8(ptr addrspace(1) %out, ptr addrspa
;
; GFX1250-LABEL: v_test_umax_uge_i8:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_u8 s3, s[4:5], 0x0
+; GFX1250-NEXT: s_load_u8 s4, s[2:3], 0x0
+; GFX1250-NEXT: s_load_u8 s5, s[6:7], 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_max_u32 s2, s2, s3
+; GFX1250-NEXT: s_max_u32 s2, s4, s5
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-NEXT: global_store_b8 v0, v1, s[0:1]
@@ -777,13 +775,12 @@ define amdgpu_kernel void @v_test_umax_ugt_i32(ptr addrspace(1) %out, ptr addrsp
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset
-; GFX1250-NEXT: s_load_b32 s2, s[0:1], 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: s_load_b32 s6, s[0:1], 0x0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_max_u32_e32 v0, s2, v0
-; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1250-NEXT: v_max_u32_e32 v0, s6, v0
+; GFX1250-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1250-NEXT: s_endpgm
;
; EG-LABEL: v_test_umax_ugt_i32:
@@ -1122,12 +1119,12 @@ define amdgpu_kernel void @test_umax_ugt_i64(ptr addrspace(1) %out, i64 %a, i64
;
; GFX1250-LABEL: test_umax_ugt_i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_max_u64 v[0:1], s[2:3], s[4:5]
+; GFX1250-NEXT: v_max_u64 v[0:1], s[2:3], s[6:7]
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-NEXT: s_endpgm
;
@@ -1175,12 +1172,12 @@ define amdgpu_kernel void @test_umax_uge_i64(ptr addrspace(1) %out, i64 %a, i64
;
; GFX1250-LABEL: test_umax_uge_i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_max_u64 v[0:1], s[2:3], s[4:5]
+; GFX1250-NEXT: v_max_u64 v[0:1], s[2:3], s[6:7]
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-NEXT: s_endpgm
;
@@ -1228,12 +1225,12 @@ define amdgpu_kernel void @test_imax_sgt_i64(ptr addrspace(1) %out, i64 %a, i64
;
; GFX1250-LABEL: test_imax_sgt_i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_max_i64 v[0:1], s[2:3], s[4:5]
+; GFX1250-NEXT: v_max_i64 v[0:1], s[2:3], s[6:7]
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-NEXT: s_endpgm
;
@@ -1281,12 +1278,12 @@ define amdgpu_kernel void @test_imax_sge_i64(ptr addrspace(1) %out, i64 %a, i64
;
; GFX1250-LABEL: test_imax_sge_i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_max_i64 v[0:1], s[2:3], s[4:5]
+; GFX1250-NEXT: v_max_i64 v[0:1], s[2:3], s[6:7]
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 311527d..6a3d31f 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -131,14 +131,14 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp
;
; GFX1250-LABEL: v_test_imin_sle_i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset
-; GFX1250-NEXT: global_load_b32 v2, v0, s[4:5] scale_offset
+; GFX1250-NEXT: global_load_b32 v2, v0, s[6:7] scale_offset
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_min_i32_e32 v1, v1, v2
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset
@@ -1172,14 +1172,14 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
;
; GFX1250-LABEL: s_test_imin_sle_v4i16:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_pk_min_i16 v1, s1, s3
; GFX1250-NEXT: v_pk_min_i16 v0, s0, s2
-; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX1250-NEXT: s_endpgm
%cmp = icmp sle <4 x i16> %a, %b
%val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
@@ -1307,14 +1307,14 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp
;
; GFX1250-LABEL: v_test_imin_slt_i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset
-; GFX1250-NEXT: global_load_b32 v2, v0, s[4:5] scale_offset
+; GFX1250-NEXT: global_load_b32 v2, v0, s[6:7] scale_offset
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_min_i32_e32 v1, v1, v2
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset
@@ -1484,14 +1484,14 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp
;
; GFX1250-LABEL: v_test_imin_slt_i16:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_load_u16 v1, v0, s[2:3] scale_offset
-; GFX1250-NEXT: global_load_u16 v2, v0, s[4:5] scale_offset
+; GFX1250-NEXT: global_load_u16 v2, v0, s[6:7] scale_offset
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_min_i16 v1, v1, v2
; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] scale_offset
@@ -1686,16 +1686,16 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
;
; GFX1250-LABEL: s_test_imin_slt_v2i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_min_i32 s0, s0, s2
; GFX1250-NEXT: s_min_i32 s1, s1, s3
; GFX1250-NEXT: v_mov_b32_e32 v0, s0
; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX1250-NEXT: s_endpgm
%cmp = icmp slt <2 x i32> %a, %b
%val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
@@ -2011,14 +2011,14 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp
;
; GFX1250-LABEL: v_test_umin_ule_i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset
-; GFX1250-NEXT: global_load_b32 v2, v0, s[4:5] scale_offset
+; GFX1250-NEXT: global_load_b32 v2, v0, s[6:7] scale_offset
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_min_u32_e32 v1, v1, v2
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset
@@ -2171,16 +2171,16 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr
;
; GFX1250-LABEL: v_test_umin_ule_v3i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 4, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_load_b96 v[0:2], v3, s[2:3]
-; GFX1250-NEXT: global_load_b96 v[4:6], v3, s[4:5]
+; GFX1250-NEXT: global_load_b96 v[4:6], v3, s[6:7]
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_min_u32_e32 v2, v2, v6
; GFX1250-NEXT: v_min_u32_e32 v1, v1, v5
@@ -2374,14 +2374,14 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr
;
; GFX1250-LABEL: v_test_umin_ule_v3i16:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[2:3] scale_offset
-; GFX1250-NEXT: global_load_b64 v[2:3], v4, s[4:5] scale_offset
+; GFX1250-NEXT: global_load_b64 v[2:3], v4, s[6:7] scale_offset
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX1250-NEXT: s_wait_loadcnt 0x0
@@ -2611,14 +2611,14 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp
;
; GFX1250-LABEL: v_test_umin_ult_i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset
-; GFX1250-NEXT: global_load_b32 v2, v0, s[4:5] scale_offset
+; GFX1250-NEXT: global_load_b32 v2, v0, s[6:7] scale_offset
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_min_u32_e32 v1, v1, v2
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset
@@ -2771,14 +2771,14 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa
;
; GFX1250-LABEL: v_test_umin_ult_i8:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_load_u8 v1, v0, s[2:3]
-; GFX1250-NEXT: global_load_u8 v2, v0, s[4:5]
+; GFX1250-NEXT: global_load_u8 v2, v0, s[6:7]
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_min_u16 v1, v1, v2
; GFX1250-NEXT: global_store_b8 v0, v1, s[0:1]
@@ -3023,23 +3023,22 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
;
; GFX1250-LABEL: v_test_umin_ult_i32_multi_use:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b32 s5, s[6:7], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[12:13], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[14:15], 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_cmp_lt_u32 s4, s5
-; GFX1250-NEXT: s_cselect_b32 s6, -1, 0
+; GFX1250-NEXT: s_cmp_lt_u32 s0, s1
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6
-; GFX1250-NEXT: s_and_b32 s6, s6, exec_lo
-; GFX1250-NEXT: s_cselect_b32 s4, s4, s5
-; GFX1250-NEXT: v_mov_b32_e32 v2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX1250-NEXT: s_and_b32 s2, s2, exec_lo
+; GFX1250-NEXT: s_cselect_b32 s0, s0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
; GFX1250-NEXT: s_clause 0x1
-; GFX1250-NEXT: global_store_b32 v1, v2, s[0:1]
-; GFX1250-NEXT: global_store_b8 v1, v0, s[2:3]
+; GFX1250-NEXT: global_store_b32 v1, v2, s[8:9]
+; GFX1250-NEXT: global_store_b8 v1, v0, s[10:11]
; GFX1250-NEXT: s_endpgm
%a = load i32, ptr addrspace(1) %aptr, align 4
%b = load i32, ptr addrspace(1) %bptr, align 4
@@ -3220,12 +3219,12 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
;
; GFX1250-LABEL: v_test_umin_ult_i16_multi_use:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_clause 0x1
-; GFX1250-NEXT: global_load_u16 v1, v0, s[6:7]
-; GFX1250-NEXT: global_load_u16 v2, v0, s[4:5]
+; GFX1250-NEXT: global_load_u16 v1, v0, s[14:15]
+; GFX1250-NEXT: global_load_u16 v2, v0, s[12:13]
; GFX1250-NEXT: s_wait_loadcnt 0x1
; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff, v1
; GFX1250-NEXT: s_wait_loadcnt 0x0
@@ -3235,8 +3234,8 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
; GFX1250-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
; GFX1250-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX1250-NEXT: s_clause 0x1
-; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX1250-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX1250-NEXT: global_store_b16 v0, v1, s[8:9]
+; GFX1250-NEXT: global_store_b8 v0, v2, s[10:11]
; GFX1250-NEXT: s_endpgm
%a = load i16, ptr addrspace(1) %aptr, align 2
%b = load i16, ptr addrspace(1) %bptr, align 2
@@ -4338,12 +4337,12 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64
;
; GFX1250-LABEL: test_umin_ult_i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_min_u64 v[0:1], s[2:3], s[4:5]
+; GFX1250-NEXT: v_min_u64 v[0:1], s[2:3], s[6:7]
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-NEXT: s_endpgm
%tmp = icmp ult i64 %a, %b
@@ -4462,12 +4461,12 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64
;
; GFX1250-LABEL: test_umin_ule_i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_min_u64 v[0:1], s[2:3], s[4:5]
+; GFX1250-NEXT: v_min_u64 v[0:1], s[2:3], s[6:7]
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-NEXT: s_endpgm
%tmp = icmp ule i64 %a, %b
@@ -4586,12 +4585,12 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64
;
; GFX1250-LABEL: test_imin_slt_i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_min_i64 v[0:1], s[2:3], s[4:5]
+; GFX1250-NEXT: v_min_i64 v[0:1], s[2:3], s[6:7]
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-NEXT: s_endpgm
%tmp = icmp slt i64 %a, %b
@@ -4710,12 +4709,12 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64
;
; GFX1250-LABEL: test_imin_sle_i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_min_i64 v[0:1], s[2:3], s[4:5]
+; GFX1250-NEXT: v_min_i64 v[0:1], s[2:3], s[6:7]
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-NEXT: s_endpgm
%tmp = icmp sle i64 %a, %b
@@ -4872,14 +4871,14 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr
;
; GFX1250-LABEL: v_test_imin_sle_v2i16:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset
-; GFX1250-NEXT: global_load_b32 v2, v0, s[4:5] scale_offset
+; GFX1250-NEXT: global_load_b32 v2, v0, s[6:7] scale_offset
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_pk_min_i16 v1, v1, v2
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset
@@ -5042,14 +5041,14 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr
;
; GFX1250-LABEL: v_test_imin_ule_v2i16:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset
-; GFX1250-NEXT: global_load_b32 v2, v0, s[4:5] scale_offset
+; GFX1250-NEXT: global_load_b32 v2, v0, s[6:7] scale_offset
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_pk_min_u16 v1, v1, v2
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index baccb4c..d29847e 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -450,6 +450,7 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a,
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x34
+; GFX1250-NEXT: ; kill: killed $sgpr4_sgpr5
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_mul_i32 s2, s3, s2
; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
@@ -613,25 +614,25 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add
;
; GFX1250-LABEL: v_trunc_i64_mul_to_i32:
; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1250-NEXT: s_mov_b32 s10, -1
-; GFX1250-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1250-NEXT: s_mov_b32 s14, s10
-; GFX1250-NEXT: s_mov_b32 s15, s11
-; GFX1250-NEXT: s_mov_b32 s6, s10
-; GFX1250-NEXT: s_mov_b32 s7, s11
+; GFX1250-NEXT: s_load_b64 s[8:9], s[4:5], 0x34
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s14, s6
+; GFX1250-NEXT: s_mov_b32 s15, s7
+; GFX1250-NEXT: s_mov_b32 s10, s6
+; GFX1250-NEXT: s_mov_b32 s11, s7
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_mov_b32 s12, s2
; GFX1250-NEXT: s_mov_b32 s13, s3
; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null
-; GFX1250-NEXT: buffer_load_b32 v1, off, s[4:7], null
-; GFX1250-NEXT: s_mov_b32 s8, s0
-; GFX1250-NEXT: s_mov_b32 s9, s1
+; GFX1250-NEXT: buffer_load_b32 v1, off, s[8:11], null
+; GFX1250-NEXT: s_mov_b32 s4, s0
+; GFX1250-NEXT: s_mov_b32 s5, s1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mul_lo_u32 v0, v1, v0
-; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1250-NEXT: s_endpgm
;
; EG-LABEL: v_trunc_i64_mul_to_i32:
@@ -2091,11 +2092,11 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun
;
; GFX1250-LABEL: s_mul_i64:
; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_mul_u64 s[4:5], s[2:3], s[4:5]
+; GFX1250-NEXT: s_mul_u64 s[4:5], s[2:3], s[6:7]
; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX1250-NEXT: s_mov_b32 s2, -1
@@ -2292,25 +2293,25 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
;
; GFX1250-LABEL: v_mul_i64:
; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1250-NEXT: s_mov_b32 s10, -1
-; GFX1250-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1250-NEXT: s_mov_b32 s14, s10
-; GFX1250-NEXT: s_mov_b32 s15, s11
-; GFX1250-NEXT: s_mov_b32 s6, s10
-; GFX1250-NEXT: s_mov_b32 s7, s11
+; GFX1250-NEXT: s_load_b64 s[8:9], s[4:5], 0x34
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s14, s6
+; GFX1250-NEXT: s_mov_b32 s15, s7
+; GFX1250-NEXT: s_mov_b32 s10, s6
+; GFX1250-NEXT: s_mov_b32 s11, s7
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_mov_b32 s12, s2
; GFX1250-NEXT: s_mov_b32 s13, s3
; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[12:15], null
-; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[4:7], null
-; GFX1250-NEXT: s_mov_b32 s8, s0
-; GFX1250-NEXT: s_mov_b32 s9, s1
+; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[8:11], null
+; GFX1250-NEXT: s_mov_b32 s4, s0
+; GFX1250-NEXT: s_mov_b32 s5, s1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mul_u64_e32 v[0:1], v[0:1], v[2:3]
-; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX1250-NEXT: s_endpgm
;
; EG-LABEL: v_mul_i64:
@@ -2845,30 +2846,30 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1250-LABEL: mul64_in_branch:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1250-NEXT: s_cmp_lg_u64 s[12:13], 0
; GFX1250-NEXT: s_cbranch_scc0 .LBB16_3
; GFX1250-NEXT: ; %bb.1: ; %else
-; GFX1250-NEXT: s_mul_u64 s[4:5], s[4:5], s[6:7]
+; GFX1250-NEXT: s_mul_u64 s[0:1], s[12:13], s[14:15]
; GFX1250-NEXT: s_cbranch_execnz .LBB16_4
; GFX1250-NEXT: .LBB16_2: ; %if
-; GFX1250-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1250-NEXT: s_mov_b32 s6, -1
-; GFX1250-NEXT: s_mov_b32 s4, s2
-; GFX1250-NEXT: s_mov_b32 s5, s3
-; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_mov_b32 s0, s10
+; GFX1250-NEXT: s_mov_b32 s1, s11
+; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[0:3], null
; GFX1250-NEXT: s_branch .LBB16_5
; GFX1250-NEXT: .LBB16_3:
-; GFX1250-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX1250-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX1250-NEXT: s_branch .LBB16_2
; GFX1250-NEXT: .LBB16_4:
-; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX1250-NEXT: .LBB16_5: ; %endif
-; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s10, -1
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
; GFX1250-NEXT: s_endpgm
;
; EG-LABEL: mul64_in_branch:
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index b0651ef..78207c2 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -340,46 +340,46 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
;
; GFX1250-SDAG-LABEL: fadd_v32_vs:
; GFX1250-SDAG: ; %bb.0:
-; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[34:35] offset:16
-; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:48
-; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
-; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[34:35]
-; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:80
-; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:96
-; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:64
-; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
-; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16
+; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48
+; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32
+; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[0:1]
+; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:80
+; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96
+; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64
+; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
+; GFX1250-SDAG-NEXT: s_clause 0x1
+; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s20 :: v_dual_mov_b32 v33, s21
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s22 :: v_dual_mov_b32 v35, s23
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s18 :: v_dual_mov_b32 v39, s29
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s30 :: v_dual_mov_b32 v41, s31
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s24 :: v_dual_mov_b32 v37, s19
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s28 :: v_dual_mov_b32 v55, s15
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s3 :: v_dual_mov_b32 v52, s12
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s13 :: v_dual_mov_b32 v54, s14
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s7 :: v_dual_mov_b32 v50, s2
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s27 :: v_dual_mov_b32 v46, s4
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s5 :: v_dual_mov_b32 v48, s6
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s25 :: v_dual_mov_b32 v44, s26
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v37, s39
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33]
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35]
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s8 :: v_dual_mov_b32 v33, s9
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s10 :: v_dual_mov_b32 v35, s11
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s16 :: v_dual_mov_b32 v33, s17
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s18 :: v_dual_mov_b32 v35, s19
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[40:41]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[0:1]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[38:39]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[16:17]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[36:37]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[32:33]
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[34:35]
@@ -395,58 +395,58 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[14:15], v[14:15], v[36:37]
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[38:39]
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:96
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:64
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:80
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:48
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[34:35]
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[34:35] offset:16
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:96
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:64
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:80
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:48
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[0:1]
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[0:1] offset:16
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_v32_vs:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: s_clause 0x7
-; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[34:35]
-; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:16
-; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
-; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[34:35] offset:48
-; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64
-; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80
-; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96
-; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
-; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
+; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[0:1]
+; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:16
+; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32
+; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[0:1] offset:48
+; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:64
+; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80
+; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96
+; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
+; GFX1250-GISEL-NEXT: s_clause 0x1
+; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[24:25]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[26:27]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[28:29]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[30:31]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[0:1]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[4:5]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[6:7]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[8:9]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[10:11]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[12:13]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[14:15]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33]
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[36:37]
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[38:39]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[40:41]
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[42:43]
@@ -466,14 +466,14 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[28:29], v[28:29], v[36:37]
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[30:31], v[30:31], v[38:39]
; GFX1250-GISEL-NEXT: s_clause 0x7
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[34:35]
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:16
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[34:35] offset:48
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:64
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:80
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:96
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[0:1]
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:16
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[0:1] offset:48
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:64
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:80
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:96
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
@@ -1597,46 +1597,46 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
;
; GFX1250-SDAG-LABEL: fmul_v32_vs:
; GFX1250-SDAG: ; %bb.0:
-; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[34:35] offset:16
-; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:48
-; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
-; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[34:35]
-; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:80
-; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:96
-; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:64
-; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
-; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16
+; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48
+; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32
+; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[0:1]
+; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:80
+; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96
+; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64
+; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
+; GFX1250-SDAG-NEXT: s_clause 0x1
+; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s20 :: v_dual_mov_b32 v33, s21
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s22 :: v_dual_mov_b32 v35, s23
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s18 :: v_dual_mov_b32 v39, s29
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s30 :: v_dual_mov_b32 v41, s31
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s24 :: v_dual_mov_b32 v37, s19
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s28 :: v_dual_mov_b32 v55, s15
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s3 :: v_dual_mov_b32 v52, s12
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s13 :: v_dual_mov_b32 v54, s14
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s7 :: v_dual_mov_b32 v50, s2
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s27 :: v_dual_mov_b32 v46, s4
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s5 :: v_dual_mov_b32 v48, s6
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s25 :: v_dual_mov_b32 v44, s26
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v37, s39
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33]
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35]
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s8 :: v_dual_mov_b32 v33, s9
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s10 :: v_dual_mov_b32 v35, s11
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s16 :: v_dual_mov_b32 v33, s17
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s18 :: v_dual_mov_b32 v35, s19
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[40:41]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[0:1]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[38:39]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[16:17]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[36:37]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[32:33]
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[34:35]
@@ -1652,58 +1652,58 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[36:37]
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[38:39]
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:96
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:64
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:80
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:48
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[34:35]
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[34:35] offset:16
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:96
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:64
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:80
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:48
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[0:1]
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[0:1] offset:16
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fmul_v32_vs:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: s_clause 0x7
-; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[34:35]
-; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:16
-; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
-; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[34:35] offset:48
-; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64
-; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80
-; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96
-; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
-; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
+; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[0:1]
+; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:16
+; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32
+; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[0:1] offset:48
+; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:64
+; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80
+; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96
+; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
+; GFX1250-GISEL-NEXT: s_clause 0x1
+; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[24:25]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[26:27]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[28:29]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[30:31]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[0:1]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[4:5]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[6:7]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[8:9]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[10:11]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[12:13]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[14:15]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33]
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[36:37]
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[38:39]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[40:41]
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[42:43]
@@ -1723,14 +1723,14 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[36:37]
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[38:39]
; GFX1250-GISEL-NEXT: s_clause 0x7
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[34:35]
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:16
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[34:35] offset:48
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:64
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:80
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:96
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[0:1]
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:16
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[0:1] offset:48
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:64
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:80
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:96
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
@@ -2428,46 +2428,46 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
;
; GFX1250-SDAG-LABEL: fma_v32_vs:
; GFX1250-SDAG: ; %bb.0:
-; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[34:35] offset:16
-; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:48
-; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
-; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[34:35]
-; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:80
-; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:96
-; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:64
-; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
-; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16
+; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48
+; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32
+; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[0:1]
+; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:80
+; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96
+; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64
+; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
+; GFX1250-SDAG-NEXT: s_clause 0x1
+; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[20:21]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[22:23]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[30:31]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[28:29]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[52:53], s[12:13]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[54:55], s[14:15]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[50:51], s[2:3]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[46:47], s[4:5]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[48:49], s[6:7]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[24:25]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[44:45], s[26:27]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[18:19]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[40:41]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[42:43]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[50:51]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[48:49]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[50:51], s[10:11]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[46:47], s[12:13]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[48:49], s[14:15]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[44:45]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[44:45], s[46:47]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[38:39]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[32:33], v[32:33]
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[34:35], v[34:35]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[8:9]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[10:11]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[16:17]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[18:19]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[6:7], v[6:7], v[40:41], v[40:41]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[0:1]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[4:5], v[4:5], v[38:39], v[38:39]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[16:17]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[36:37]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[28:29], v[28:29], v[52:53], v[52:53]
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[20:21], v[20:21], v[32:33], v[32:33]
@@ -2482,58 +2482,58 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[14:15], v[14:15], v[36:37], v[36:37]
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[12:13], v[12:13], v[38:39], v[38:39]
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:96
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:64
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:80
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:48
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[34:35]
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[34:35] offset:16
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:96
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:64
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:80
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:48
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[0:1]
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[0:1] offset:16
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fma_v32_vs:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: s_clause 0x7
-; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[34:35]
-; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:16
-; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
-; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[34:35] offset:48
-; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64
-; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80
-; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96
-; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
-; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
+; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[0:1]
+; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:16
+; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32
+; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[0:1] offset:48
+; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:64
+; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80
+; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96
+; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
+; GFX1250-GISEL-NEXT: s_clause 0x1
+; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[24:25]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[26:27]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[28:29]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[30:31]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[0:1]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[4:5]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[6:7]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[8:9]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[10:11]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[12:13]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[14:15]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[32:33], v[32:33]
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[34:35], v[34:35]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[4:5], v[4:5], v[36:37], v[36:37]
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[6:7], v[6:7], v[38:39], v[38:39]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[8:9], v[8:9], v[40:41], v[40:41]
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[10:11], v[10:11], v[42:43], v[42:43]
@@ -2553,14 +2553,14 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[28:29], v[28:29], v[36:37], v[36:37]
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[30:31], v[30:31], v[38:39], v[38:39]
; GFX1250-GISEL-NEXT: s_clause 0x7
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[34:35]
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:16
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[34:35] offset:48
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:64
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:80
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:96
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[0:1]
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:16
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[0:1] offset:48
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:64
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:80
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:96
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
@@ -3529,9 +3529,9 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
;
; GFX1250-SDAG-LABEL: fadd_fadd_fsub:
; GFX1250-SDAG: ; %bb.0: ; %bb
+; GFX1250-SDAG-NEXT: s_clause 0x1
; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-SDAG-NEXT: s_add_f32 s2, s1, s3
@@ -3541,14 +3541,14 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, v2
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[4:5], v[0:1] neg_lo:[0,1] neg_hi:[0,1]
-; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_fadd_fsub:
; GFX1250-GISEL: ; %bb.0: ; %bb
+; GFX1250-GISEL-NEXT: s_clause 0x1
; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -3560,7 +3560,7 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[2:3], v[0:1]
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_subrev_f32 v3, s3, v0
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-GISEL-NEXT: global_store_b64 v0, v[2:3], s[4:5]
+; GFX1250-GISEL-NEXT: global_store_b64 v0, v[2:3], s[6:7]
; GFX1250-GISEL-NEXT: s_endpgm
bb:
%i12 = fadd <2 x float> %arg, %arg1
diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
index 6509d80..f88b1bf 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
@@ -12,7 +12,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5
; REGALLOC-GFX908-NEXT: {{ $}}
; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef %6:agpr_32
- ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7798794 /* regdef:VReg_128 */, def %25
+ ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:VReg_128 */, def %25
; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:av_128 = COPY %25
; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3735562 /* regdef:VReg_64 */, def %27
; REGALLOC-GFX908-NEXT: SI_SPILL_AV64_SAVE %27, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
@@ -37,7 +37,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX908-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
; PEI-GFX908-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef renamable $agpr0
- ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7798794 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
+ ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3735562 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1
; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
@@ -61,7 +61,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5
; REGALLOC-GFX90A-NEXT: {{ $}}
; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef %6:agpr_32
- ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7995402 /* regdef:VReg_128_Align2 */, def %23
+ ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8257546 /* regdef:VReg_128_Align2 */, def %23
; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %23
; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3997706 /* regdef:VReg_64_Align2 */, def %21
; REGALLOC-GFX90A-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY %21
@@ -80,7 +80,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX90A-NEXT: liveins: $sgpr4_sgpr5
; PEI-GFX90A-NEXT: {{ $}}
; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef renamable $agpr0
- ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7995402 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
+ ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8257546 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3997706 /* regdef:VReg_64_Align2 */, def renamable $vgpr2_vgpr3
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
index b717f85..6671201 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
@@ -186,12 +186,12 @@ define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32
;
; GFX1250-LABEL: mixed_inreg_block_count_x:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_load_b32 s2, s[0:1], 0x10
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b32 s4, s[0:1], 0x10
+; GFX1250-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index 4d367ef..c1764c9 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -346,10 +346,10 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr ad
;
; GFX1250-LABEL: byref_preload_arg:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x100
+; GFX1250-NEXT: s_load_b64 s[4:5], s[0:1], 0x100
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1250-NEXT: v_mov_b32_e32 v2, s5
; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_store_b32 v0, v2, s[2:3] scope:SCOPE_SYS
@@ -404,10 +404,10 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o
;
; GFX1250-LABEL: byref_staggered_preload_arg:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x100
+; GFX1250-NEXT: s_load_b64 s[4:5], s[0:1], 0x100
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1250-NEXT: v_mov_b32_e32 v2, s5
; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_store_b32 v0, v2, s[2:3] scope:SCOPE_SYS
diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir
index b32e997..80afe7a 100644
--- a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir
+++ b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir
@@ -43,17 +43,17 @@ machineFunctionInfo:
body: |
bb.0:
- INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10 /* regdef */, implicit-def $agpr0
+ INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2424842 /* regdef:AGPR_32 */, implicit-def $agpr0
%14:vgpr_32 = COPY killed $agpr0
- INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 27394058 /* regdef:VReg_512 */, def %7, 13697034 /* regdef:VReg_256 */, def %8, 6225930 /* regdef:VReg_128 */, def %9, 4915210 /* regdef:VReg_96 */, def %10, 4915210 /* regdef:VReg_96 */, def %11
+ INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 40042506 /* regdef:VReg_512 */, def %7, 19464202 /* regdef:VReg_256 */, def %8, 7929866 /* regdef:VReg_128 */, def %9, 5963786 /* regdef:VReg_96 */, def %10, 5963786 /* regdef:VReg_96 */, def %11
INLINEASM &"; clobber", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 12 /* clobber */, implicit-def dead early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 27394057 /* reguse:VReg_512 */, %7
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 13697033 /* reguse:VReg_256 */, %8
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:VReg_128 */, %9
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4915209 /* reguse:VReg_96 */, %10
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4915209 /* reguse:VReg_96 */, %11
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 40042505 /* reguse:VReg_512 */, %7
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 19464201 /* reguse:VReg_256 */, %8
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7929865 /* reguse:VReg_128 */, %9
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5963785 /* reguse:VReg_96 */, %10
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5963785 /* reguse:VReg_96 */, %11
$agpr1 = COPY %14
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9 /* reguse */, killed $agpr1
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, killed $agpr1
SI_RETURN
...
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir
index 1b09f5d..ad490f8 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir
@@ -41,9 +41,9 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1)
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]]
- ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[COPY3]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3801097 /* reguse:AV_64_Align2 */, [[V_MFMA_F64_4X4X4F64_e64_]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]]
+ ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]]:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[COPY3]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:VReg_64_Align2 */, [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]]
; CHECK-NEXT: SI_RETURN
%0:vreg_64_align2 = COPY $vgpr4_vgpr5
%1:av_64_align2 = COPY $vgpr0_vgpr1
@@ -51,7 +51,7 @@ body: |
%3:areg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1)
%4:vreg_128_align2 = COPY %3
%5:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3801097 /* reguse:VReg_64_Align2 */, %5
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:VReg_64_Align2 */, %5
SI_RETURN
...
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir
index d7b713a..0b4e662 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir
@@ -19,7 +19,7 @@ body: |
; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]]
; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: SI_RETURN
@@ -30,7 +30,7 @@ body: |
%4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec
undef %5.sub0_sub1:areg_128_align2 = COPY %4
%5.sub2_sub3 = IMPLICIT_DEF
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %5
GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1)
GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
SI_RETURN
@@ -172,7 +172,7 @@ body: |
; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]].sub2_sub3:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3
; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: SI_RETURN
@@ -183,7 +183,7 @@ body: |
undef %4.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec
undef %5.sub0_sub1:areg_128_align2 = COPY %4.sub2_sub3
%5.sub2_sub3 = IMPLICIT_DEF
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %5
GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1)
GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
SI_RETURN
@@ -208,7 +208,7 @@ body: |
; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]].sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]].sub2
; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: SI_RETURN
@@ -219,7 +219,7 @@ body: |
undef %4.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec
undef %5.sub1:areg_128_align2 = COPY %4.sub2
%5.sub2_sub3 = IMPLICIT_DEF
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %5
GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1)
GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
SI_RETURN
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir
index 57f611b..dcf3b8b 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir
@@ -17,7 +17,7 @@ body: |
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1)
; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: SI_RETURN
%0:vreg_64_align2 = COPY $vgpr4_vgpr5
@@ -26,7 +26,7 @@ body: |
%3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1)
%4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
undef %5.sub0_sub1:areg_128_align2 = COPY %4
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %5
GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1)
SI_RETURN
...
@@ -47,7 +47,7 @@ body: |
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1)
; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: SI_RETURN
%0:vreg_64_align2 = COPY $vgpr4_vgpr5
@@ -56,7 +56,7 @@ body: |
%3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1)
%4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec
undef %5.sub0_sub1:areg_128_align2 = COPY %4
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %5
GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1)
SI_RETURN
...
@@ -79,7 +79,7 @@ body: |
; CHECK-NEXT: dead %other_use:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub0_sub1
; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_1:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_1]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4521993 /* reguse:VS_64_with_sub0_in_VS_32_Lo128 */, [[COPY3]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4915209 /* reguse:AReg_64_Align2 */, [[COPY3]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
; CHECK-NEXT: SI_RETURN
%0:vreg_64_align2 = COPY $vgpr4_vgpr5
@@ -90,7 +90,7 @@ body: |
%other_use:vreg_64_align2 = COPY %4.sub0_sub1
%5:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
%6:areg_64_align2 = COPY %5
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4521993 /* reguse:AReg_64_Align2 */, %6:areg_64_align2
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4915209 /* reguse:AReg_64_Align2 */, %6:areg_64_align2
GLOBAL_STORE_DWORDX2 %0, %6, 0, 0, implicit $exec :: (store (s64), addrspace 1)
SI_RETURN
...
@@ -114,7 +114,7 @@ body: |
; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_e64_1:%[0-9]+]].sub0_sub1:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_2:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_2]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4521993 /* reguse:VS_64_with_sub0_in_VS_32_Lo128 */, [[COPY3]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4915209 /* reguse:AReg_64_Align2 */, [[COPY3]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
; CHECK-NEXT: SI_RETURN
%0:vreg_64_align2 = COPY $vgpr4_vgpr5
@@ -126,7 +126,7 @@ body: |
undef %5.sub0_sub1:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4, 0, 0, 0, implicit $mode, implicit $exec
%6:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %5.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
%7:areg_64_align2 = COPY %6
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4521993 /* reguse:AReg_64_Align2 */, %7
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4915209 /* reguse:AReg_64_Align2 */, %7
GLOBAL_STORE_DWORDX2 %0, %7, 0, 0, implicit $exec :: (store (s64), addrspace 1)
SI_RETURN
@@ -151,7 +151,7 @@ body: |
; CHECK-NEXT: dead %other_use:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1
; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_2:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_2]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: SI_RETURN
%0:vreg_64_align2 = COPY $vgpr4_vgpr5
@@ -163,7 +163,7 @@ body: |
%other_use:vreg_64_align2 = COPY %5.sub0_sub1
%6:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %5.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
undef %8.sub0_sub1:areg_128_align2 = COPY %6
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %8:areg_128_align2
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %8:areg_128_align2
GLOBAL_STORE_DWORDX4 %0, %8, 0, 0, implicit $exec :: (store (s128), addrspace 1)
SI_RETURN
@@ -231,7 +231,7 @@ body: |
; CHECK-NEXT: dead %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3
; CHECK-NEXT: dead %other_use2:vreg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2
; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: SI_RETURN
%0:vreg_64_align2 = COPY $vgpr4_vgpr5
@@ -245,7 +245,7 @@ body: |
%other_use1:vreg_64_align2 = COPY %4.sub2_sub3
%other_use2:vreg_64 = COPY %4.sub1_sub2
%6:areg_128_align2 = COPY %4
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %6:areg_128_align2
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %6:areg_128_align2
GLOBAL_STORE_DWORDX4 %0, %6, 0, 0, implicit $exec :: (store (s128), addrspace 1)
SI_RETURN
...
diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
index 335d58c..a18847b 100644
--- a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
@@ -324,11 +324,9 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg %
; SDAG-LABEL: flat_atomicrmw_b64_rtn_idxprom:
; SDAG: ; %bb.0: ; %entry
; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 3, s[0:1]
-; SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
-; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; SDAG-NEXT: v_xor_b32_e32 v0, s0, v3
+; SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v3
; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -350,10 +348,9 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg %
; SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; SDAG-NEXT: s_cbranch_execz .LBB21_2
; SDAG-NEXT: .LBB21_4: ; %atomicrmw.private
-; SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v2
+; SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -367,12 +364,12 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg %
;
; GISEL-LABEL: flat_atomicrmw_b64_rtn_idxprom:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v0, src_flat_scratch_base_hi
+; GISEL-NEXT: v_mov_b32_e32 v2, v0
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GISEL-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 3, s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GISEL-NEXT: v_xor_b32_e32 v0, v5, v0
+; GISEL-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
; GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo
@@ -394,11 +391,10 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg %
; GISEL-NEXT: s_and_not1_saveexec_b32 s0, s2
; GISEL-NEXT: s_cbranch_execz .LBB21_2
; GISEL-NEXT: .LBB21_4: ; %atomicrmw.private
-; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GISEL-NEXT: v_sub_nc_u32_e32 v0, v4, v0
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GISEL-NEXT: scratch_load_b64 v[0:1], v4, off
; GISEL-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll
index b5bb68e..e0ea08d 100644
--- a/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll
+++ b/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll
@@ -97,9 +97,9 @@ entry:
define amdgpu_ps <2 x float> @s_load_b64_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
; GCN-LABEL: s_load_b64_idxprom:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_load_b64 s[4:5], s[0:1], s2 offset:0x0 scale_offset
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GCN-NEXT: ; return to shader part epilog
entry:
%idxprom = zext i32 %idx to i64
@@ -111,10 +111,10 @@ entry:
define amdgpu_ps <3 x float> @s_load_b96_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
; GCN-LABEL: s_load_b96_idxprom:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b96 s[0:2], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_load_b96 s[4:6], s[0:1], s2 offset:0x0 scale_offset
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: ; return to shader part epilog
entry:
%idxprom = zext i32 %idx to i64
@@ -126,10 +126,10 @@ entry:
define amdgpu_ps <4 x float> @s_load_b128_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
; GCN-LABEL: s_load_b128_idxprom:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b128 s[0:3], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_load_b128 s[4:7], s[0:1], s2 offset:0x0 scale_offset
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GCN-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GCN-NEXT: ; return to shader part epilog
entry:
%idxprom = zext i32 %idx to i64
@@ -141,12 +141,12 @@ entry:
define amdgpu_ps <8 x float> @s_load_b256_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
; GCN-LABEL: s_load_b256_idxprom:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b256 s[0:7], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_load_b256 s[4:11], s[0:1], s2 offset:0x0 scale_offset
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GCN-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GCN-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9
+; GCN-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11
; GCN-NEXT: ; return to shader part epilog
entry:
%idxprom = zext i32 %idx to i64
@@ -158,16 +158,16 @@ entry:
define amdgpu_ps <16 x float> @s_load_b512_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
; GCN-LABEL: s_load_b512_idxprom:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b512 s[0:15], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_load_b512 s[4:19], s[0:1], s2 offset:0x0 scale_offset
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GCN-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GCN-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GCN-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GCN-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15
+; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GCN-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GCN-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9
+; GCN-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11
+; GCN-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v9, s13
+; GCN-NEXT: v_dual_mov_b32 v10, s14 :: v_dual_mov_b32 v11, s15
+; GCN-NEXT: v_dual_mov_b32 v12, s16 :: v_dual_mov_b32 v13, s17
+; GCN-NEXT: v_dual_mov_b32 v14, s18 :: v_dual_mov_b32 v15, s19
; GCN-NEXT: ; return to shader part epilog
entry:
%idxprom = zext i32 %idx to i64
@@ -275,11 +275,11 @@ entry:
define amdgpu_ps <2 x float> @s_load_b64_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
; GCN-LABEL: s_load_b64_idxprom_range:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_load_b32 s4, s[0:1], 0x0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], s4 offset:0x0 scale_offset
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GCN-NEXT: ; return to shader part epilog
entry:
%idx = load i32, ptr addrspace(4) %p, align 4, !range !0
@@ -294,10 +294,10 @@ define amdgpu_ps <3 x float> @s_load_b96_idxprom_range(ptr addrspace(4) align 4
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_load_b96 s[0:2], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_load_b96 s[4:6], s[0:1], s2 offset:0x0 scale_offset
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: ; return to shader part epilog
entry:
%idx = load i32, ptr addrspace(4) %p, align 4, !range !0
@@ -312,10 +312,10 @@ define amdgpu_ps <4 x float> @s_load_b128_idxprom_range(ptr addrspace(4) align 4
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_load_b128 s[0:3], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_load_b128 s[4:7], s[0:1], s2 offset:0x0 scale_offset
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GCN-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GCN-NEXT: ; return to shader part epilog
entry:
%idx = load i32, ptr addrspace(4) %p, align 4, !range !0
@@ -330,12 +330,12 @@ define amdgpu_ps <8 x float> @s_load_b256_idxprom_range(ptr addrspace(4) align 4
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_load_b256 s[0:7], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_load_b256 s[4:11], s[0:1], s2 offset:0x0 scale_offset
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GCN-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GCN-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9
+; GCN-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11
; GCN-NEXT: ; return to shader part epilog
entry:
%idx = load i32, ptr addrspace(4) %p, align 4, !range !0
@@ -350,16 +350,16 @@ define amdgpu_ps <16 x float> @s_load_b512_idxprom_range(ptr addrspace(4) align
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_load_b512 s[0:15], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_load_b512 s[4:19], s[0:1], s2 offset:0x0 scale_offset
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GCN-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GCN-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GCN-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GCN-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15
+; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GCN-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GCN-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9
+; GCN-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11
+; GCN-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v9, s13
+; GCN-NEXT: v_dual_mov_b32 v10, s14 :: v_dual_mov_b32 v11, s15
+; GCN-NEXT: v_dual_mov_b32 v12, s16 :: v_dual_mov_b32 v13, s17
+; GCN-NEXT: v_dual_mov_b32 v14, s18 :: v_dual_mov_b32 v15, s19
; GCN-NEXT: ; return to shader part epilog
entry:
%idx = load i32, ptr addrspace(4) %p, align 4, !range !0
diff --git a/llvm/test/CodeGen/AMDGPU/true16-fold.mir b/llvm/test/CodeGen/AMDGPU/true16-fold.mir
index 93cc12f..9484417 100644
--- a/llvm/test/CodeGen/AMDGPU/true16-fold.mir
+++ b/llvm/test/CodeGen/AMDGPU/true16-fold.mir
@@ -57,6 +57,7 @@ body: |
%4:vgpr_16 = COPY %3:sgpr_lo16
%5:vgpr_32 = V_ALIGNBIT_B32_t16_e64 0, %0:sreg_32, 0, killed %1:sreg_32, 0, killed %4:vgpr_16, 0, 0, implicit $exec
S_ENDPGM 0, implicit %5
+...
---
name: fold_16bit_madmix_clamp
@@ -207,3 +208,27 @@ body: |
$vgpr0 = COPY %4
S_ENDPGM 0, implicit $vgpr0
...
+
+---
+name: fold_imm16_across_reg_sequence
+tracksRegLiveness: true
+registers:
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-LABEL: name: fold_imm16_across_reg_sequence
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[V_MOV_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64 0, -1, 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B16_t16_e64_1:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64 0, -1, 0, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_MOV_B16_t16_e64_]], %subreg.lo16, [[V_MOV_B16_t16_e64_1]], %subreg.hi16
+ ; CHECK-NEXT: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F32_e64 0, -1, 0, -1, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F32_e64_]]
+ ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
+ %0:vgpr_16 = V_MOV_B16_t16_e64 0, -1, 0, implicit $exec
+ %1:vgpr_16 = V_MOV_B16_t16_e64 0, -1, 0, implicit $exec
+ %2:vgpr_32 = REG_SEQUENCE %0, %subreg.lo16, %1, %subreg.hi16
+ %3:vgpr_32 = nofpexcept V_MAX_F32_e64 0, %2, 0, %2, 0, 0, implicit $mode, implicit $exec
+ $vgpr0 = COPY %3
+ S_ENDPGM 0, implicit $vgpr0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll b/llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll
index f2ecfe8..3d74b17 100644
--- a/llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll
@@ -17,16 +17,16 @@ define amdgpu_kernel void @v_ashr_pk_i8_i32(ptr addrspace(1) %out, i32 %src0, i3
;
; GFX1250-LABEL: v_ashr_pk_i8_i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_and_b32 s2, s2, 31
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_mov_b32_e32 v0, s2
; GFX1250-NEXT: v_ashr_pk_i8_i32 v0, s0, s1, v0
-; GFX1250-NEXT: global_store_b16 v1, v0, s[4:5]
+; GFX1250-NEXT: global_store_b16 v1, v0, s[6:7]
; GFX1250-NEXT: s_endpgm
%insert.0 = insertelement <2 x i32> poison, i32 %src0, i64 0
%build_vector = insertelement <2 x i32> %insert.0, i32 %src1, i64 1
@@ -58,16 +58,16 @@ define amdgpu_kernel void @v_ashr_pk_u8_i32(ptr addrspace(1) %out, i32 %src0, i3
;
; GFX1250-LABEL: v_ashr_pk_u8_i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_and_b32 s2, s2, 31
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_mov_b32_e32 v0, s2
; GFX1250-NEXT: v_ashr_pk_u8_i32 v0, s0, s1, v0
-; GFX1250-NEXT: global_store_b16 v1, v0, s[4:5]
+; GFX1250-NEXT: global_store_b16 v1, v0, s[6:7]
; GFX1250-NEXT: s_endpgm
%insert.0 = insertelement <2 x i32> poison, i32 %src0, i64 0
%build_vector = insertelement <2 x i32> %insert.0, i32 %src1, i64 1
diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
index af8b9e7..6fe99d8 100644
--- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
@@ -520,6 +520,7 @@ body: |
; GCN-NEXT: GLOBAL_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr4_vgpr5, 16, 0, implicit $exec
; GCN-NEXT: S_WAIT_KMCNT 0
; GCN-NEXT: $sgpr2 = S_ADD_I32 $sgpr0, 100, implicit-def $scc
+ ; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 20, implicit $exec
$sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4)
$vgpr0 = V_MOV_B32_e32 1, implicit $exec
@@ -921,7 +922,6 @@ body: |
$vgpr2 = V_MOV_B32_e32 1, implicit $exec
...
-# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0.
---
name: wait_kmcnt_with_outstanding_vmem
tracksRegLiveness: true
@@ -937,6 +937,7 @@ body: |
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
; GCN-NEXT: S_WAIT_KMCNT 0
; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
+ ; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
@@ -944,7 +945,6 @@ body: |
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
...
-# FIXME: Missing S_WAIT_XCNT before overwriting sgpr0.
---
name: wait_loadcnt_with_outstanding_smem
tracksRegLiveness: true
@@ -960,6 +960,7 @@ body: |
; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; GCN-NEXT: S_WAIT_LOADCNT 0
; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec
+ ; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $sgpr0 = S_MOV_B32 0
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
@@ -967,7 +968,6 @@ body: |
$sgpr0 = S_MOV_B32 0
...
-# TODO: Unnecessary wait before overwriting vgpr0.
---
name: overwrite_vgpr_after_smem
tracksRegLiveness: true
@@ -981,14 +981,12 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
- ; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
...
-# TODO: Unnecessary wait before overwriting sgpr0.
---
name: overwrite_sgpr_after_vmem
tracksRegLiveness: true
@@ -1002,7 +1000,6 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
- ; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $sgpr0 = S_MOV_B32 0
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
index a392692..6636eb5 100644
--- a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
@@ -211,38 +211,39 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac
; GFX1250-SDAG-LABEL: workgroup_id_xyz:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_bfe_u32 s0, ttmp6, 0x40014
-; GFX1250-SDAG-NEXT: s_lshr_b32 s6, ttmp7, 16
-; GFX1250-SDAG-NEXT: s_add_co_i32 s7, s0, 1
+; GFX1250-SDAG-NEXT: s_lshr_b32 s8, ttmp7, 16
+; GFX1250-SDAG-NEXT: s_add_co_i32 s9, s0, 1
+; GFX1250-SDAG-NEXT: s_clause 0x1
; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
+; GFX1250-SDAG-NEXT: s_mul_i32 s4, s8, s9
; GFX1250-SDAG-NEXT: s_bfe_u32 s9, ttmp6, 0x40010
-; GFX1250-SDAG-NEXT: s_mul_i32 s7, s6, s7
-; GFX1250-SDAG-NEXT: s_bfe_u32 s8, ttmp6, 0x40008
+; GFX1250-SDAG-NEXT: s_bfe_u32 s5, ttmp6, 0x40008
; GFX1250-SDAG-NEXT: s_and_b32 s10, ttmp7, 0xffff
; GFX1250-SDAG-NEXT: s_add_co_i32 s9, s9, 1
; GFX1250-SDAG-NEXT: s_bfe_u32 s11, ttmp6, 0x4000c
-; GFX1250-SDAG-NEXT: s_add_co_i32 s8, s8, s7
-; GFX1250-SDAG-NEXT: s_mul_i32 s7, s10, s9
+; GFX1250-SDAG-NEXT: s_add_co_i32 s5, s5, s4
+; GFX1250-SDAG-NEXT: s_mul_i32 s4, s10, s9
; GFX1250-SDAG-NEXT: s_bfe_u32 s9, ttmp6, 0x40004
; GFX1250-SDAG-NEXT: s_add_co_i32 s11, s11, 1
-; GFX1250-SDAG-NEXT: s_add_co_i32 s9, s9, s7
-; GFX1250-SDAG-NEXT: s_and_b32 s7, ttmp6, 15
+; GFX1250-SDAG-NEXT: s_add_co_i32 s9, s9, s4
+; GFX1250-SDAG-NEXT: s_and_b32 s4, ttmp6, 15
; GFX1250-SDAG-NEXT: s_mul_i32 s11, ttmp9, s11
; GFX1250-SDAG-NEXT: s_getreg_b32 s12, hwreg(HW_REG_IB_STS2, 6, 4)
-; GFX1250-SDAG-NEXT: s_add_co_i32 s7, s7, s11
+; GFX1250-SDAG-NEXT: s_add_co_i32 s4, s4, s11
; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s12, 0
-; GFX1250-SDAG-NEXT: s_cselect_b32 s7, ttmp9, s7
+; GFX1250-SDAG-NEXT: s_cselect_b32 s4, ttmp9, s4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s7
-; GFX1250-SDAG-NEXT: s_cselect_b32 s7, s10, s9
-; GFX1250-SDAG-NEXT: s_cselect_b32 s6, s6, s8
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s6
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1250-SDAG-NEXT: s_cselect_b32 s4, s10, s9
+; GFX1250-SDAG-NEXT: s_cselect_b32 s5, s8, s5
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_clause 0x2
; GFX1250-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1250-SDAG-NEXT: global_store_b32 v0, v2, s[2:3]
-; GFX1250-SDAG-NEXT: global_store_b32 v0, v3, s[4:5]
+; GFX1250-SDAG-NEXT: global_store_b32 v0, v3, s[6:7]
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: workgroup_id_xyz:
@@ -250,39 +251,40 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac
; GFX1250-GISEL-NEXT: s_bfe_u32 s0, ttmp6, 0x4000c
; GFX1250-GISEL-NEXT: s_and_b32 s1, ttmp6, 15
; GFX1250-GISEL-NEXT: s_add_co_i32 s0, s0, 1
-; GFX1250-GISEL-NEXT: s_getreg_b32 s6, hwreg(HW_REG_IB_STS2, 6, 4)
+; GFX1250-GISEL-NEXT: s_getreg_b32 s8, hwreg(HW_REG_IB_STS2, 6, 4)
; GFX1250-GISEL-NEXT: s_mul_i32 s0, ttmp9, s0
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-GISEL-NEXT: s_add_co_i32 s1, s1, s0
-; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s6, 0
-; GFX1250-GISEL-NEXT: s_cselect_b32 s7, ttmp9, s1
+; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s8, 0
+; GFX1250-GISEL-NEXT: s_cselect_b32 s9, ttmp9, s1
; GFX1250-GISEL-NEXT: s_bfe_u32 s0, ttmp6, 0x40010
-; GFX1250-GISEL-NEXT: s_and_b32 s8, ttmp7, 0xffff
+; GFX1250-GISEL-NEXT: s_and_b32 s10, ttmp7, 0xffff
; GFX1250-GISEL-NEXT: s_add_co_i32 s0, s0, 1
-; GFX1250-GISEL-NEXT: s_bfe_u32 s9, ttmp6, 0x40004
-; GFX1250-GISEL-NEXT: s_mul_i32 s10, s8, s0
+; GFX1250-GISEL-NEXT: s_bfe_u32 s11, ttmp6, 0x40004
+; GFX1250-GISEL-NEXT: s_mul_i32 s12, s10, s0
+; GFX1250-GISEL-NEXT: s_clause 0x1
; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
+; GFX1250-GISEL-NEXT: s_add_co_i32 s11, s11, s12
+; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s8, 0
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, s9
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
-; GFX1250-GISEL-NEXT: s_add_co_i32 s9, s9, s10
-; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s6, 0
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, s7
-; GFX1250-GISEL-NEXT: s_cselect_b32 s8, s8, s9
-; GFX1250-GISEL-NEXT: s_bfe_u32 s9, ttmp6, 0x40014
+; GFX1250-GISEL-NEXT: s_cselect_b32 s4, s10, s11
+; GFX1250-GISEL-NEXT: s_bfe_u32 s5, ttmp6, 0x40014
; GFX1250-GISEL-NEXT: s_lshr_b32 s10, ttmp7, 16
-; GFX1250-GISEL-NEXT: s_add_co_i32 s9, s9, 1
+; GFX1250-GISEL-NEXT: s_add_co_i32 s5, s5, 1
; GFX1250-GISEL-NEXT: s_bfe_u32 s11, ttmp6, 0x40008
-; GFX1250-GISEL-NEXT: s_mul_i32 s9, s10, s9
+; GFX1250-GISEL-NEXT: s_mul_i32 s5, s10, s5
; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX1250-GISEL-NEXT: s_add_co_i32 s11, s11, s9
-; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s6, 0
-; GFX1250-GISEL-NEXT: s_cselect_b32 s6, s10, s11
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v3, s6
+; GFX1250-GISEL-NEXT: s_add_co_i32 s11, s11, s5
+; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s8, 0
+; GFX1250-GISEL-NEXT: s_cselect_b32 s5, s10, s11
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: s_clause 0x2
; GFX1250-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1250-GISEL-NEXT: global_store_b32 v1, v2, s[2:3]
-; GFX1250-GISEL-NEXT: global_store_b32 v1, v3, s[4:5]
+; GFX1250-GISEL-NEXT: global_store_b32 v1, v3, s[6:7]
; GFX1250-GISEL-NEXT: s_endpgm
; GFX12-LABEL: workgroup_id_xyz:
; GFX12: ; %bb.0: