Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/GlobalISel')
 llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll | 20
 llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll | 20
 llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll | 20
 llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll | 4
 llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll | 18
 llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll | 16
 llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll | 46
 llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll | 6
 llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll | 274
 llvm/test/CodeGen/AMDGPU/GlobalISel/fabs.ll | 340
 llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll | 84
 llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll | 108
 llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll | 38
 llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 4
 llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.ll | 165
 llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll | 2
 llvm/test/CodeGen/AMDGPU/GlobalISel/fneg.ll | 303
 llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll | 57
 llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll | 72
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll | 12
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshr-new-regbank-select.ll | 35
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll | 12
 llvm/test/CodeGen/AMDGPU/GlobalISel/global-value-addrspaces.ll | 104
 llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll | 70
 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll | 6
 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir | 6
 llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll | 8
 llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll | 432
 llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll | 9
 llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll | 126
 llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll | 2
 llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll | 8
 llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir | 12
 llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant-32bit.mir | 32
 llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-constant-32bit.mir | 36
 llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-zextload-constant-32bit.mir | 48
 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll | 356
 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll | 2
 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll | 213
 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll | 323
 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll | 2
 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll | 2
 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll | 10
 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll | 4
 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll | 14
 llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll | 34
 llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant32bit.ll | 4
 llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll | 7
 llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform.ll | 6
 llvm/test/CodeGen/AMDGPU/GlobalISel/load-zero-and-sign-extending-uniform-in-vgpr.ll | 4
 llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll | 14
 llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs-i64.ll | 5
 llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll | 2
 llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll | 184
 llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 1191
 llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir | 4
 llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgpu-wave-address.mir | 7
 llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-block-addr.mir | 2
 llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fmul.mir | 5
 llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir | 146
 llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll | 6
 llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir | 16
 llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-widen-scalar-loads.mir | 152
 llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll | 3277
 llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 1625
 llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll | 522
 llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll | 3752
 llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll | 20
 llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll | 14
 llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 1226
 llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll | 7
 71 files changed, 8119 insertions(+), 7594 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll
index e117200..d6f1b14 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll
@@ -50,7 +50,7 @@ define i16 @s_add_i16(i16 inreg %a, i16 inreg %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_co_i32 s0, s0, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
%c = add i16 %a, %b
@@ -145,7 +145,7 @@ define i32 @s_add_i32(i32 inreg %a, i32 inreg %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_co_i32 s0, s0, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
%c = add i32 %a, %b
@@ -263,11 +263,11 @@ define <2 x i16> @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) {
; GFX12-NEXT: s_lshr_b32 s2, s0, 16
; GFX12-NEXT: s_lshr_b32 s3, s1, 16
; GFX12-NEXT: s_add_co_i32 s0, s0, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s2, s2, s3
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
%c = add <2 x i16> %a, %b
@@ -374,7 +374,7 @@ define i64 @s_add_i64(i64 inreg %a, i64 inreg %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: s_setpc_b64 s[30:31]
%c = add i64 %a, %b
@@ -425,7 +425,7 @@ define i64 @v_add_i64(i64 %a, i64 %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX12-NEXT: s_setpc_b64 s[30:31]
%c = add i64 %a, %b
@@ -513,7 +513,7 @@ define void @s_uaddo_uadde(i64 inreg %a, i64 inreg %b, ptr addrspace(1) %res, pt
; GFX12-NEXT: s_add_co_u32 s0, s0, s2
; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s3
; GFX12-NEXT: s_cselect_b32 s2, 1, 0
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-NEXT: v_mov_b32_e32 v6, s2
; GFX12-NEXT: global_store_b64 v[0:1], v[4:5], off
@@ -593,9 +593,9 @@ define void @v_uaddo_uadde(i64 %a, i64 %b, ptr addrspace(1) %res, ptr addrspace(
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX12-NEXT: global_store_b64 v[4:5], v[0:1], off
; GFX12-NEXT: global_store_b32 v[6:7], v2, off
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
index ff618c0..8063b29 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
@@ -622,9 +622,9 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -779,9 +779,9 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1212,9 +1212,9 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1367,9 +1367,9 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1833,7 +1833,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2000,7 +2000,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
index 007417c..5b0b602 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
@@ -622,9 +622,9 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -779,9 +779,9 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1212,9 +1212,9 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1367,9 +1367,9 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1833,7 +1833,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2000,7 +2000,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
index 77d212a..7f3e24f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
@@ -3069,7 +3069,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: flat_store_b32 v[0:1], v3
@@ -4161,7 +4161,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
-; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
index dd01112..121dd30 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
@@ -240,11 +240,11 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: s_mov_b32 s6, exec_lo
-; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_and_saveexec_b32 s7, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB4_6
; GFX10-NEXT: ; %bb.1: ; %loop.start.preheader
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: s_mov_b32 s8, 0
; GFX10-NEXT: ; implicit-def: $sgpr10
; GFX10-NEXT: ; implicit-def: $sgpr11
; GFX10-NEXT: ; implicit-def: $sgpr9
@@ -303,7 +303,7 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
; GFX10-NEXT: v_mov_b32_e32 v0, 5
; GFX10-NEXT: flat_store_dword v[3:4], v0
; GFX10-NEXT: .LBB4_8: ; %exit
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -345,8 +345,8 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
; GFX10-LABEL: divergent_i1_icmp_used_outside_loop:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s6, 0
; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: s_mov_b32 s6, 0
; GFX10-NEXT: ; implicit-def: $sgpr7
; GFX10-NEXT: s_branch .LBB5_2
; GFX10-NEXT: .LBB5_1: ; %Flow
@@ -377,7 +377,7 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
; GFX10-NEXT: global_store_dword v[8:9], v5, off
; GFX10-NEXT: .LBB5_4: ; %loop.break.block
; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s4, v1
; GFX10-NEXT: s_mov_b32 s5, exec_lo
@@ -397,7 +397,7 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
; GFX10-NEXT: ; %bb.7: ; %if.block.1
; GFX10-NEXT: global_store_dword v[6:7], v4, off
; GFX10-NEXT: .LBB5_8: ; %exit
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -457,8 +457,8 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa
; GFX10-LABEL: divergent_i1_freeze_used_outside_loop:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_mov_b32 s1, exec_lo
-; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: ; implicit-def: $sgpr4
; GFX10-NEXT: ; implicit-def: $sgpr3
; GFX10-NEXT: s_branch .LBB6_2
@@ -534,15 +534,15 @@ exit:
define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %a.break) {
; GFX10-LABEL: loop_with_1break:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: ; implicit-def: $sgpr6
; GFX10-NEXT: ; implicit-def: $sgpr7
; GFX10-NEXT: ; implicit-def: $sgpr5
; GFX10-NEXT: s_branch .LBB7_2
; GFX10-NEXT: .LBB7_1: ; %Flow
; GFX10-NEXT: ; in Loop: Header=BB7_2 Depth=1
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-NEXT: s_and_b32 s1, exec_lo, s6
; GFX10-NEXT: s_or_b32 s4, s1, s4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
index fd08ab8..5c57d35 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
@@ -106,13 +106,13 @@ exit:
define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a) {
; GFX10-LABEL: loop_with_1break:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: ; implicit-def: $sgpr5
; GFX10-NEXT: s_branch .LBB2_2
; GFX10-NEXT: .LBB2_1: ; %Flow
; GFX10-NEXT: ; in Loop: Header=BB2_2 Depth=1
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-NEXT: s_and_b32 s1, exec_lo, s5
; GFX10-NEXT: s_or_b32 s4, s1, s4
@@ -180,13 +180,13 @@ exit:
define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b) {
; GFX10-LABEL: loop_with_2breaks:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: ; implicit-def: $sgpr5
; GFX10-NEXT: s_branch .LBB3_3
; GFX10-NEXT: .LBB3_1: ; %Flow3
; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7
; GFX10-NEXT: s_andn2_b32 s2, s5, exec_lo
; GFX10-NEXT: s_and_b32 s3, exec_lo, s6
@@ -278,13 +278,13 @@ exit:
define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c) {
; GFX10-LABEL: loop_with_3breaks:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: ; implicit-def: $sgpr5
; GFX10-NEXT: s_branch .LBB4_4
; GFX10-NEXT: .LBB4_1: ; %Flow5
; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX10-NEXT: s_andn2_b32 s2, s6, exec_lo
; GFX10-NEXT: s_and_b32 s3, exec_lo, s8
@@ -404,15 +404,15 @@ exit:
define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %a.break) {
; GFX10-LABEL: loop_with_div_break_with_body:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: ; implicit-def: $sgpr6
; GFX10-NEXT: ; implicit-def: $sgpr7
; GFX10-NEXT: ; implicit-def: $sgpr5
; GFX10-NEXT: s_branch .LBB5_2
; GFX10-NEXT: .LBB5_1: ; %Flow
; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-NEXT: s_and_b32 s1, exec_lo, s6
; GFX10-NEXT: s_or_b32 s4, s1, s4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
index d13d6a1..a8b27ec 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
@@ -101,8 +101,8 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, i32 %x.size, ptr ad
; GFX10-LABEL: loop_with_1break:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_mov_b32_e32 v3, 0
-; GFX10-NEXT: s_mov_b32 s8, 0
; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: s_mov_b32 s8, 0
; GFX10-NEXT: ; implicit-def: $sgpr10
; GFX10-NEXT: ; implicit-def: $sgpr9
; GFX10-NEXT: s_branch .LBB2_3
@@ -131,7 +131,7 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, i32 %x.size, ptr ad
; GFX10-NEXT: s_andn2_b32 s6, s9, exec_lo
; GFX10-NEXT: s_and_b32 s5, exec_lo, s5
; GFX10-NEXT: s_or_b32 s9, s6, s5
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_cbranch_execz .LBB2_5
; GFX10-NEXT: .LBB2_3: ; %A
@@ -197,14 +197,14 @@ define void @nested_loops_temporal_divergence_inner(float %pre.cond.val, i32 %n.
; GFX10-LABEL: nested_loops_temporal_divergence_inner:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cmp_lt_f32_e64 s8, 1.0, v0
-; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v0
; GFX10-NEXT: s_mov_b32 s6, 0
+; GFX10-NEXT: s_mov_b32 s8, 0
; GFX10-NEXT: .LBB3_1: ; %OuterHeader
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB3_2 Depth 2
; GFX10-NEXT: s_ashr_i32 s7, s6, 31
-; GFX10-NEXT: s_mov_b32 s4, s8
+; GFX10-NEXT: s_mov_b32 s4, s5
; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], 2
; GFX10-NEXT: ; implicit-def: $sgpr9
; GFX10-NEXT: v_mov_b32_e32 v6, s10
@@ -239,13 +239,13 @@ define void @nested_loops_temporal_divergence_inner(float %pre.cond.val, i32 %n.
; GFX10-NEXT: s_add_i32 s6, s6, 1
; GFX10-NEXT: v_add_co_u32 v6, s4, v4, v6
; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v5, v7, s4
-; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
; GFX10-NEXT: flat_store_byte v[6:7], v0
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_cbranch_execnz .LBB3_1
; GFX10-NEXT: ; %bb.4: ; %exit
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -288,14 +288,14 @@ define void @nested_loops_temporal_divergence_outer(float %pre.cond.val, i32 %n.
; GFX10-LABEL: nested_loops_temporal_divergence_outer:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cmp_lt_f32_e64 s8, 1.0, v0
-; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v0
; GFX10-NEXT: s_mov_b32 s6, 0
+; GFX10-NEXT: s_mov_b32 s8, 0
; GFX10-NEXT: .LBB4_1: ; %OuterHeader
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB4_2 Depth 2
; GFX10-NEXT: s_ashr_i32 s7, s6, 31
-; GFX10-NEXT: s_mov_b32 s4, s8
+; GFX10-NEXT: s_mov_b32 s4, s5
; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], 2
; GFX10-NEXT: ; implicit-def: $sgpr9
; GFX10-NEXT: v_mov_b32_e32 v6, s10
@@ -330,13 +330,13 @@ define void @nested_loops_temporal_divergence_outer(float %pre.cond.val, i32 %n.
; GFX10-NEXT: s_add_i32 s6, s6, 1
; GFX10-NEXT: v_add_co_u32 v6, s4, v4, v6
; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v5, v7, s4
-; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
; GFX10-NEXT: flat_store_byte v[6:7], v0
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_cbranch_execnz .LBB4_1
; GFX10-NEXT: ; %bb.4: ; %exit
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -379,15 +379,15 @@ define void @nested_loops_temporal_divergence_both(float %pre.cond.val, i32 %n.i
; GFX10-LABEL: nested_loops_temporal_divergence_both:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cmp_lt_f32_e64 s8, 1.0, v0
-; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v0
; GFX10-NEXT: s_mov_b32 s6, 0
+; GFX10-NEXT: s_mov_b32 s8, 0
; GFX10-NEXT: ; implicit-def: $sgpr9
; GFX10-NEXT: .LBB5_1: ; %OuterHeader
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB5_2 Depth 2
; GFX10-NEXT: s_ashr_i32 s7, s6, 31
-; GFX10-NEXT: s_mov_b32 s4, s8
+; GFX10-NEXT: s_mov_b32 s4, s5
; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], 2
; GFX10-NEXT: v_mov_b32_e32 v8, s10
; GFX10-NEXT: v_mov_b32_e32 v9, s11
@@ -421,13 +421,13 @@ define void @nested_loops_temporal_divergence_both(float %pre.cond.val, i32 %n.i
; GFX10-NEXT: s_add_i32 s6, s6, 1
; GFX10-NEXT: v_add_co_u32 v8, s4, v4, v8
; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v5, v9, s4
-; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
; GFX10-NEXT: flat_store_byte v[8:9], v0
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_cbranch_execnz .LBB5_1
; GFX10-NEXT: ; %bb.4: ; %exit
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: flat_store_byte v[6:7], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
index 990e4f6..a28827a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
@@ -28,7 +28,7 @@ define i32 @v_extract_v64i32_varidx(ptr addrspace(1) %ptr, i32 %idx) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v2
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: global_load_b32 v0, v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -61,7 +61,7 @@ define i16 @v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 1, v2
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: global_load_u16 v0, v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -94,7 +94,7 @@ define i64 @v_extract_v32i64_varidx(ptr addrspace(1) %ptr, i32 %idx) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v2
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
index 405861d..9dfd0a4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
@@ -10,41 +10,75 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_sgpr_idx(ptr addrspace(4) inre
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s0, s4, 3
; GFX9-NEXT: s_lshl_b32 s0, s0, 4
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], s0 offset:0x0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
+; GFX9-NEXT: v_readfirstlane_b32 s3, v3
; GFX9-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s0, s4, 3
; GFX8-NEXT: s_lshl_b32 s0, s0, 4
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], s0
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_add_u32 s0, s2, s0
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: v_readfirstlane_b32 s2, v2
+; GFX8-NEXT: v_readfirstlane_b32 s3, v3
; GFX8-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_and_b32 s0, s4, 3
-; GFX7-NEXT: s_lshl_b32 s0, s0, 4
-; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], s0
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_and_b32 s2, s4, 3
+; GFX7-NEXT: s_lshl_b32 s4, s2, 4
+; GFX7-NEXT: s_mov_b32 s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s2, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: v_readfirstlane_b32 s1, v1
+; GFX7-NEXT: v_readfirstlane_b32 s2, v2
+; GFX7-NEXT: v_readfirstlane_b32 s3, v3
; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_and_b32 s0, s4, 3
; GFX10-NEXT: s_lshl_b32 s0, s0, 4
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], s0 offset:0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s2, v2
+; GFX10-NEXT: v_readfirstlane_b32 s3, v3
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_b32 s0, s4, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_lshl_b32 s0, s0, 4
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], s0 offset:0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: v_readfirstlane_b32 s2, v2
+; GFX11-NEXT: v_readfirstlane_b32 s3, v3
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i128>, ptr addrspace(4) %ptr
%element = extractelement <4 x i128> %vector, i32 %idx
@@ -281,22 +315,63 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inre
}
define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx0(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i128_idx0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i128_idx0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
+; GFX9-NEXT: v_readfirstlane_b32 s3, v3
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i128_idx0:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: v_readfirstlane_b32 s2, v2
+; GFX8-NEXT: v_readfirstlane_b32 s3, v3
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i128_idx0:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: v_readfirstlane_b32 s1, v1
+; GFX7-NEXT: v_readfirstlane_b32 s2, v2
+; GFX7-NEXT: v_readfirstlane_b32 s3, v3
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i128_idx0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s2, v2
+; GFX10-NEXT: v_readfirstlane_b32 s3, v3
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i128_idx0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: v_readfirstlane_b32 s2, v2
+; GFX11-NEXT: v_readfirstlane_b32 s3, v3
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i128>, ptr addrspace(4) %ptr
%element = extractelement <4 x i128> %vector, i32 0
@@ -306,32 +381,63 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx0(ptr addrspace(4) inreg %p
define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx1(ptr addrspace(4) inreg %ptr) {
; GFX9-LABEL: extractelement_sgpr_v4i128_idx1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x10
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
+; GFX9-NEXT: v_readfirstlane_b32 s3, v3
; GFX9-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: extractelement_sgpr_v4i128_idx1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x10
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_add_u32 s0, s2, 16
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: v_readfirstlane_b32 s2, v2
+; GFX8-NEXT: v_readfirstlane_b32 s3, v3
; GFX8-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: extractelement_sgpr_v4i128_idx1:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: v_readfirstlane_b32 s1, v1
+; GFX7-NEXT: v_readfirstlane_b32 s2, v2
+; GFX7-NEXT: v_readfirstlane_b32 s3, v3
; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i128_idx1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x10
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s2, v2
+; GFX10-NEXT: v_readfirstlane_b32 s3, v3
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i128_idx1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:16
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: v_readfirstlane_b32 s2, v2
+; GFX11-NEXT: v_readfirstlane_b32 s3, v3
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i128>, ptr addrspace(4) %ptr
%element = extractelement <4 x i128> %vector, i32 1
@@ -341,32 +447,63 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx1(ptr addrspace(4) inreg %p
define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx2(ptr addrspace(4) inreg %ptr) {
; GFX9-LABEL: extractelement_sgpr_v4i128_idx2:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x20
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:32
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
+; GFX9-NEXT: v_readfirstlane_b32 s3, v3
; GFX9-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: extractelement_sgpr_v4i128_idx2:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x20
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_add_u32 s0, s2, 32
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: v_readfirstlane_b32 s2, v2
+; GFX8-NEXT: v_readfirstlane_b32 s3, v3
; GFX8-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: extractelement_sgpr_v4i128_idx2:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: v_readfirstlane_b32 s1, v1
+; GFX7-NEXT: v_readfirstlane_b32 s2, v2
+; GFX7-NEXT: v_readfirstlane_b32 s3, v3
; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i128_idx2:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x20
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:32
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s2, v2
+; GFX10-NEXT: v_readfirstlane_b32 s3, v3
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i128_idx2:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x20
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:32
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: v_readfirstlane_b32 s2, v2
+; GFX11-NEXT: v_readfirstlane_b32 s3, v3
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i128>, ptr addrspace(4) %ptr
%element = extractelement <4 x i128> %vector, i32 2
@@ -376,32 +513,63 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx2(ptr addrspace(4) inreg %p
define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx3(ptr addrspace(4) inreg %ptr) {
; GFX9-LABEL: extractelement_sgpr_v4i128_idx3:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x30
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:48
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
+; GFX9-NEXT: v_readfirstlane_b32 s3, v3
; GFX9-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: extractelement_sgpr_v4i128_idx3:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x30
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_add_u32 s0, s2, 48
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: v_readfirstlane_b32 s2, v2
+; GFX8-NEXT: v_readfirstlane_b32 s3, v3
; GFX8-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: extractelement_sgpr_v4i128_idx3:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0xc
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: v_readfirstlane_b32 s1, v1
+; GFX7-NEXT: v_readfirstlane_b32 s2, v2
+; GFX7-NEXT: v_readfirstlane_b32 s3, v3
; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i128_idx3:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x30
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:48
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s2, v2
+; GFX10-NEXT: v_readfirstlane_b32 s3, v3
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i128_idx3:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x30
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:48
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: v_readfirstlane_b32 s2, v2
+; GFX11-NEXT: v_readfirstlane_b32 s3, v3
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i128>, ptr addrspace(4) %ptr
%element = extractelement <4 x i128> %vector, i32 3
@@ -585,3 +753,5 @@ define i128 @extractelement_vgpr_v4i128_idx3(ptr addrspace(1) %ptr) {
%element = extractelement <4 x i128> %vector, i32 3
ret i128 %element
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fabs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fabs.ll
new file mode 100644
index 0000000..39a793c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fabs.ll
@@ -0,0 +1,340 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12 %s
+
+define amdgpu_ps void @v_fabs_f16(half %in, ptr addrspace(1) %out) {
+; GCN-LABEL: v_fabs_f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GCN-NEXT: global_store_b16 v[1:2], v0, off
+; GCN-NEXT: s_endpgm
+ %fabs = call half @llvm.fabs.f16(half %in)
+ store half %fabs, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fabs_f16(half inreg %in, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fabs_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_and_b32_e64 v2, 0x7fff, s0
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fabs_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %fabs = call half @llvm.fabs.f16(half %in)
+ store half %fabs, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fabs_f16_salu_use(half inreg %in, i32 inreg %val, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fabs_f16_salu_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_and_b32_e64 v2, 0x7fff, s0
+; GFX11-NEXT: s_cmp_eq_u32 s1, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: s_cselect_b32 s0, s0, 0
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fabs_f16_salu_use:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX12-NEXT: s_cmp_eq_u32 s1, 0
+; GFX12-NEXT: s_cselect_b32 s0, s0, 0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %fabs = call half @llvm.fabs.f16(half %in)
+ %cond = icmp eq i32 %val, 0
+ %sel = select i1 %cond, half %fabs, half 0.0
+ store half %sel, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_fabs_f32(float %in, ptr addrspace(1) %out) {
+; GCN-LABEL: v_fabs_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GCN-NEXT: global_store_b32 v[1:2], v0, off
+; GCN-NEXT: s_endpgm
+ %fabs = call float @llvm.fabs.f32(float %in)
+ store float %fabs, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fabs_f32(float inreg %in, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fabs_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_and_b32_e64 v2, 0x7fffffff, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fabs_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_bitset0_b32 s0, 31
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %fabs = call float @llvm.fabs.f32(float %in)
+ store float %fabs, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fabs_f32_salu_use(float inreg %in, i32 inreg %val, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fabs_f32_salu_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_and_b32_e64 v2, 0x7fffffff, s0
+; GFX11-NEXT: s_cmp_eq_u32 s1, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: s_cselect_b32 s0, s0, 0
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fabs_f32_salu_use:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_bitset0_b32 s0, 31
+; GFX12-NEXT: s_cmp_eq_u32 s1, 0
+; GFX12-NEXT: s_cselect_b32 s0, s0, 0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %fabs = call float @llvm.fabs.f32(float %in)
+ %cond = icmp eq i32 %val, 0
+ %sel = select i1 %cond, float %fabs, float 0.0
+ store float %sel, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_fabs_f64(double %in, ptr addrspace(1) %out) {
+; GCN-LABEL: v_fabs_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GCN-NEXT: s_endpgm
+ %fabs = call double @llvm.fabs.f64(double %in)
+ store double %fabs, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fabs_f64(double inreg %in, ptr addrspace(1) %out) {
+; GCN-LABEL: s_fabs_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
+; GCN-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GCN-NEXT: s_endpgm
+ %fabs = call double @llvm.fabs.f64(double %in)
+ store double %fabs, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fabs_f64_salu_use(double inreg %in, i32 inreg %val, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fabs_f64_salu_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: s_cmp_eq_u32 s2, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v3
+; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fabs_f64_salu_use:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: s_cmp_eq_u32 s2, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
+; GFX12-NEXT: v_readfirstlane_b32 s0, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_readfirstlane_b32 s1, v3
+; GFX12-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-NEXT: s_endpgm
+ %fabs = call double @llvm.fabs.f64(double %in)
+ %cond = icmp eq i32 %val, 0
+ %sel = select i1 %cond, double %fabs, double 0.0
+ store double %sel, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_fabs_v2f16(<2 x half> %in, ptr addrspace(1) %out) {
+; GCN-LABEL: v_fabs_v2f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GCN-NEXT: global_store_b32 v[1:2], v0, off
+; GCN-NEXT: s_endpgm
+ %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
+ store <2 x half> %fabs, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fabs_v2f16(<2 x half> inreg %in, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fabs_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_and_b32_e64 v2, 0x7fff7fff, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fabs_v2f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_lshr_b32 s1, s0, 16
+; GFX12-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX12-NEXT: s_and_b32 s1, s1, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
+ store <2 x half> %fabs, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fabs_v2f16_salu_use(<2 x half> inreg %in, i32 inreg %val, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fabs_v2f16_salu_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_and_b32_e64 v2, 0x7fff7fff, s0
+; GFX11-NEXT: s_cmp_eq_u32 s1, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: s_cselect_b32 s0, s0, 0
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fabs_v2f16_salu_use:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_lshr_b32 s2, s0, 16
+; GFX12-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX12-NEXT: s_and_b32 s2, s2, 0x7fff
+; GFX12-NEXT: s_cmp_eq_u32 s1, 0
+; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_cselect_b32 s0, s0, 0
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
+ %cond = icmp eq i32 %val, 0
+ %sel = select i1 %cond, <2 x half> %fabs, <2 x half> <half 0.0, half 0.0>
+ store <2 x half> %sel, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_fabs_v2f32(<2 x float> %in, ptr addrspace(1) %out) {
+; GCN-LABEL: v_fabs_v2f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GCN-NEXT: s_endpgm
+ %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
+ store <2 x float> %fabs, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fabs_v2f32(<2 x float> inreg %in, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fabs_v2f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_and_b32_e64 v2, 0x7fffffff, s0
+; GFX11-NEXT: v_and_b32_e64 v3, 0x7fffffff, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: v_readfirstlane_b32 s1, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fabs_v2f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_bitset0_b32 s0, 31
+; GFX12-NEXT: s_bitset0_b32 s1, 31
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-NEXT: s_endpgm
+ %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
+ store <2 x float> %fabs, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fabs_v2f32_salu_use(<2 x float> inreg %in, i32 inreg %val, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fabs_v2f32_salu_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_and_b32_e64 v2, 0x7fffffff, s0
+; GFX11-NEXT: v_and_b32_e64 v3, 0x7fffffff, s1
+; GFX11-NEXT: s_cmp_eq_u32 s2, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: v_readfirstlane_b32 s1, v3
+; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fabs_v2f32_salu_use:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_bitset0_b32 s0, 31
+; GFX12-NEXT: s_bitset0_b32 s1, 31
+; GFX12-NEXT: s_cmp_eq_u32 s2, 0
+; GFX12-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-NEXT: s_endpgm
+ %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
+ %cond = icmp eq i32 %val, 0
+ %sel = select i1 %cond, <2 x float> %fabs, <2 x float> <float 0.0, float 0.0>
+ store <2 x float> %sel, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_fabs_fneg_f32(float %in, ptr addrspace(1) %out) {
+; GCN-LABEL: v_fabs_fneg_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_or_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: global_store_b32 v[1:2], v0, off
+; GCN-NEXT: s_endpgm
+ %fabs = call float @llvm.fabs.f32(float %in)
+ %fneg = fneg float %fabs
+ store float %fneg, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fabs_fneg_f32(float inreg %in, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fabs_fneg_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_or_b32_e64 v2, 0x80000000, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fabs_fneg_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_bitset1_b32 s0, 31
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %fabs = call float @llvm.fabs.f32(float %in)
+ %fneg = fneg float %fabs
+ store float %fneg, ptr addrspace(1) %out
+ ret void
+}
+
+declare half @llvm.fabs.f16(half)
+declare float @llvm.fabs.f32(float)
+declare double @llvm.fabs.f64(double)
+declare <2 x half> @llvm.fabs.v2f16(<2 x half>)
+declare <2 x float> @llvm.fabs.v2f32(<2 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 1b879a6..62b264a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -174,7 +174,7 @@ define half @v_fdiv_f16(half %a, half %b) {
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v3, v3, v2
; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v2
@@ -218,7 +218,7 @@ define half @v_fdiv_f16_afn(half %a, half %b) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn half %a, %b
@@ -384,7 +384,7 @@ define half @v_fdiv_f16_ulp25(half %a, half %b) {
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v3, v3, v2
; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v2
@@ -672,7 +672,7 @@ define half @v_fdiv_f16_afn_ulp25(half %a, half %b) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn half %a, %b
@@ -739,7 +739,7 @@ define half @v_fdiv_f16_arcp_ulp25(half %a, half %b) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp half %a, %b
@@ -1041,7 +1041,7 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v5
; GFX11-NEXT: v_rcp_f32_e32 v4, v4
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_dual_mul_f32 v6, v6, v3 :: v_dual_mul_f32 v7, v7, v4
; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1]
; GFX11-NEXT: v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
@@ -1115,7 +1115,7 @@ define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) {
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-NEXT: v_rcp_f16_e32 v2, v2
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX11-NEXT: v_mul_f16_e32 v1, v3, v2
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
@@ -1419,7 +1419,7 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v5
; GFX11-NEXT: v_rcp_f32_e32 v4, v4
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_dual_mul_f32 v6, v6, v3 :: v_dual_mul_f32 v7, v7, v4
; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1]
; GFX11-NEXT: v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
@@ -1700,7 +1700,7 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_mix_f32 v5, -v0, v2, v4 op_sel_hi:[1,0,0]
; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX11-NEXT: v_fma_f32 v5, v5, v2, v2
@@ -1981,7 +1981,7 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0]
; GFX11-NEXT: v_fma_mix_f32 v6, v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX11-NEXT: v_fma_f32 v5, v5, v2, -v2
@@ -2277,7 +2277,7 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX11-NEXT: v_rcp_f32_e32 v4, v4
; GFX11-NEXT: v_mov_b32_e32 v5, 1.0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_mix_f32 v7, -|v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX11-NEXT: v_fma_f32 v7, v7, v4, v4
; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
@@ -2288,7 +2288,7 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v0, v2, 1.0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_mix_f32 v6, -v1, v3, v5 op_sel_hi:[1,0,0]
; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3
; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0]
@@ -2579,7 +2579,7 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX11-NEXT: v_rcp_f32_e32 v4, v4
; GFX11-NEXT: v_mov_b32_e32 v5, -1.0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_mix_f32 v7, |v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX11-NEXT: v_fma_f32 v7, v7, v4, -v4
; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
@@ -2590,7 +2590,7 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v0, v2, -1.0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_mix_f32 v6, v1, v3, v5 op_sel_hi:[1,0,0]
; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3
; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0]
@@ -2704,7 +2704,7 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_rcp_f16_e32 v0, v0
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp <2 x half> <half 1.0, half 1.0>, %x
@@ -2753,7 +2753,7 @@ define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) {
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_rcp_f16_e32 v0, v0
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp afn <2 x half> <half 1.0, half 1.0>, %x
@@ -3020,7 +3020,7 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_mix_f32 v5, -v0, v2, v4 op_sel_hi:[1,0,0]
; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX11-NEXT: v_fma_f32 v5, v5, v2, v2
@@ -3094,7 +3094,7 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-NEXT: v_rcp_f16_e32 v2, v2
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX11-NEXT: v_mul_f16_e32 v1, v3, v2
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
@@ -3212,7 +3212,7 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-NEXT: v_rcp_f16_e32 v2, v2
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX11-NEXT: v_mul_f16_e32 v1, v3, v2
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
@@ -3274,7 +3274,7 @@ define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-NEXT: v_rcp_f16_e32 v2, v2
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX11-NEXT: v_mul_f16_e32 v1, v3, v2
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
@@ -3445,7 +3445,7 @@ define amdgpu_ps i16 @s_fdiv_f16(i16 inreg %a.arg, i16 inreg %b.arg) {
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s1
; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s0
; GFX11-NEXT: v_rcp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
; GFX11-NEXT: v_fma_mix_f32 v2, -s1, v1, s0 op_sel_hi:[1,0,1]
; GFX11-NEXT: v_fmac_f32_e32 v1, v2, v0
@@ -3523,7 +3523,7 @@ define amdgpu_ps i16 @s_fdiv_f16_arcp(i16 inreg %a.arg, i16 inreg %b.arg) {
; GFX11-LABEL: s_fdiv_f16_arcp:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_rcp_f16_e32 v0, s1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f16_e32 v0, s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
@@ -3562,7 +3562,7 @@ define amdgpu_ps i16 @s_fdiv_f16_afn(i16 inreg %a.arg, i16 inreg %b.arg) {
; GFX11-LABEL: s_fdiv_f16_afn:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_rcp_f16_e32 v0, s1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f16_e32 v0, s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
@@ -3883,7 +3883,7 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s3
; GFX11-NEXT: v_rcp_f32_e32 v0, v0
; GFX11-NEXT: v_rcp_f32_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_dual_mul_f32 v2, v2, v0 :: v_dual_mul_f32 v3, v3, v1
; GFX11-NEXT: v_fma_mix_f32 v4, -s1, v2, s0 op_sel_hi:[1,0,1]
; GFX11-NEXT: v_fma_mix_f32 v5, -s2, v3, s3 op_sel_hi:[1,0,1]
@@ -3963,7 +3963,7 @@ define amdgpu_ps i16 @s_rcp_f16(i16 inreg %a.arg) {
; GFX11-LABEL: s_rcp_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_rcp_f16_e32 v0, s0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%a = bitcast i16 %a.arg to half
@@ -4027,7 +4027,7 @@ define amdgpu_ps i16 @s_neg_rcp_f16(i16 inreg %a.arg) {
; GFX11-LABEL: s_neg_rcp_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_rcp_f16_e64 v0, -s0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%a = bitcast i16 %a.arg to half
@@ -4097,7 +4097,7 @@ define amdgpu_ps i16 @s_rsq_f16(i16 inreg %a.arg) {
; GFX11-LABEL: s_rsq_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_rsq_f16_e32 v0, s0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%a = bitcast i16 %a.arg to half
@@ -4384,12 +4384,12 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX11-NEXT: v_sqrt_f16_e32 v0, s0
; GFX11-NEXT: v_sqrt_f16_e32 v1, s1
; GFX11-NEXT: v_mov_b32_e32 v4, -1.0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0]
; GFX11-NEXT: v_fma_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0]
; GFX11-NEXT: v_fma_f32 v5, v5, v2, -v2
@@ -4540,7 +4540,7 @@ define half @v_neg_rsq_f16(half %a) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_rcp_f16_e64 v0, -v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract half @llvm.sqrt.f16(half %a)
@@ -4615,7 +4615,7 @@ define { half, half } @v_rsq_f16_multi_use(half %a) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_sqrt_f16_e32 v2, v0
; GFX11-NEXT: v_rsq_f16_e32 v1, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract half @llvm.sqrt.f16(half %a)
@@ -4689,7 +4689,7 @@ define half @v_rsq_f16_missing_contract0(half %a) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_rcp_f16_e32 v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sqrt = call half @llvm.sqrt.f16(half %a)
@@ -4761,7 +4761,7 @@ define half @v_rsq_f16_missing_contract1(half %a) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_rcp_f16_e32 v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract half @llvm.sqrt.f16(half %a)
@@ -4833,7 +4833,7 @@ define half @v_neg_rsq_f16_missing_contract0(half %a) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_rcp_f16_e64 v0, -v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sqrt = call half @llvm.sqrt.f16(half %a)
@@ -4905,7 +4905,7 @@ define half @v_neg_rsq_f16_missing_contract1(half %a) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_rcp_f16_e64 v0, -v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract half @llvm.sqrt.f16(half %a)
@@ -4977,7 +4977,7 @@ define half @v_neg_rsq_f16_fabs(half %a) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_sqrt_f16_e64 v0, |v0|
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_rcp_f16_e64 v0, -v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call half @llvm.fabs.f16(half %a)
@@ -5112,7 +5112,7 @@ define half @v_neg_rsq_f16_arcp(half %a) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_rcp_f16_e64 v0, -v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract half @llvm.sqrt.f16(half %a)
@@ -5178,7 +5178,7 @@ define half @v_rsq_f16_afn_nocontract(half %a) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_rcp_f16_e32 v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sqrt = call half @llvm.sqrt.f16(half %a)
@@ -5452,12 +5452,12 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
; GFX11-NEXT: v_mov_b32_e32 v4, 1.0
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_mix_f32 v5, -v0, v2, v4 op_sel_hi:[1,0,0]
; GFX11-NEXT: v_fma_mix_f32 v6, -v1, v3, v4 op_sel_hi:[1,0,0]
; GFX11-NEXT: v_fma_f32 v5, v5, v2, v2
@@ -5745,12 +5745,12 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
; GFX11-NEXT: v_mov_b32_e32 v4, -1.0
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0]
; GFX11-NEXT: v_fma_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0]
; GFX11-NEXT: v_fma_f32 v5, v5, v2, -v2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
index 3ea918e..b54cf77 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
@@ -161,7 +161,7 @@ define float @v_fdiv_f32(float %a, float %b) {
; GFX11-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-IEEE-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -183,7 +183,7 @@ define float @v_fdiv_f32(float %a, float %b) {
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-FLUSH-NEXT: s_denorm_mode 3
-; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FLUSH-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -363,7 +363,7 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 {
; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-IEEE-NEXT: s_denorm_mode 15
-; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-IEEE-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3
@@ -387,7 +387,7 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 {
; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-FLUSH-NEXT: s_denorm_mode 3
-; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FLUSH-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
@@ -425,7 +425,7 @@ define float @v_fdiv_f32_afn(float %a, float %b) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_rcp_f32_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn float %a, %b
@@ -498,7 +498,7 @@ define float @v_fdiv_f32_ulp25(float %a, float %b) {
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-IEEE-NEXT: v_sub_nc_u32_e32 v0, v0, v1
-; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-IEEE-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_ldexp_f32 v0, v2, v0
@@ -513,7 +513,7 @@ define float @v_fdiv_f32_ulp25(float %a, float %b) {
; GFX11-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v2
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
-; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FLUSH-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX11-FLUSH-NEXT: v_mul_f32_e32 v0, v2, v0
; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31]
@@ -625,7 +625,7 @@ define float @v_fdiv_f32_dynamic_25ulp(float %x, float %y) #0 {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_sub_nc_u32_e32 v0, v0, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_ldexp_f32 v0, v2, v0
@@ -782,7 +782,7 @@ define float @v_rcp_f32(float %x) {
; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v2, v1
-; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-IEEE-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-IEEE-NEXT: v_fma_f32 v3, -v1, v2, 1.0
; GFX11-IEEE-NEXT: v_fmac_f32_e32 v2, v3, v2
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -804,7 +804,7 @@ define float @v_rcp_f32(float %x) {
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-FLUSH-NEXT: s_denorm_mode 3
-; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FLUSH-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -966,7 +966,7 @@ define float @v_rcp_f32_arcp(float %x) {
; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v2, v1
-; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-IEEE-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-IEEE-NEXT: v_fma_f32 v3, -v1, v2, 1.0
; GFX11-IEEE-NEXT: v_fmac_f32_e32 v2, v3, v2
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -988,7 +988,7 @@ define float @v_rcp_f32_arcp(float %x) {
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-FLUSH-NEXT: s_denorm_mode 3
-; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FLUSH-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1072,7 +1072,7 @@ define float @v_rcp_f32_ulp25(float %x) {
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v1, v1
; GFX11-IEEE-NEXT: v_sub_nc_u32_e32 v0, 0, v0
-; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-IEEE-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-IEEE-NEXT: v_ldexp_f32 v0, v1, v0
; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -1104,7 +1104,7 @@ define float @v_fdiv_f32_afn_ulp25(float %a, float %b) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_rcp_f32_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn float %a, %b, !fpmath !0
@@ -1159,7 +1159,7 @@ define float @v_fdiv_f32_arcp_ulp25(float %a, float %b) {
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-IEEE-NEXT: v_sub_nc_u32_e32 v1, 0, v1
-; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-IEEE-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-IEEE-NEXT: v_ldexp_f32 v1, v2, v1
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1
@@ -1169,7 +1169,7 @@ define float @v_fdiv_f32_arcp_ulp25(float %a, float %b) {
; GFX11-FLUSH: ; %bb.0:
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
-; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FLUSH-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp float %a, %b, !fpmath !0
@@ -1421,7 +1421,7 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v6, v4
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v7, v5
-; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-IEEE-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0
; GFX11-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -1456,7 +1456,7 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v5, v4
; GFX11-FLUSH-NEXT: s_denorm_mode 3
-; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FLUSH-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0
; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v7, v5
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1474,7 +1474,7 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0
; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, v1, v3, v1
; GFX11-FLUSH-NEXT: s_denorm_mode 3
-; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FLUSH-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FLUSH-NEXT: v_fma_f32 v4, -v6, v5, 1.0
; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v4, v5
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1516,7 +1516,7 @@ define <2 x float> @v_fdiv_v2f32_afn(<2 x float> %a, <2 x float> %b) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn <2 x float> %a, %b
@@ -1627,7 +1627,7 @@ define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) {
; GFX11-IEEE-NEXT: v_sub_nc_u32_e32 v2, v6, v2
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_sub_nc_u32_e32 v3, v7, v3
-; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-IEEE-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-IEEE-NEXT: v_dual_mul_f32 v0, v0, v4 :: v_dual_mul_f32 v1, v1, v5
; GFX11-IEEE-NEXT: v_ldexp_f32 v0, v0, v2
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -1647,7 +1647,7 @@ define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) {
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FLUSH-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FLUSH-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
; GFX11-FLUSH-NEXT: v_dual_mul_f32 v0, v4, v0 :: v_dual_mul_f32 v1, v5, v1
; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31]
@@ -1900,7 +1900,7 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-IEEE-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
; GFX11-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -1935,7 +1935,7 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-FLUSH-NEXT: s_denorm_mode 3
-; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FLUSH-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1953,7 +1953,7 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, v1, 1.0
; GFX11-FLUSH-NEXT: s_denorm_mode 3
-; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FLUSH-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v4, v3, 1.0
; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -2216,7 +2216,7 @@ define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-IEEE-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
; GFX11-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -2251,7 +2251,7 @@ define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-FLUSH-NEXT: s_denorm_mode 3
-; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FLUSH-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -2269,7 +2269,7 @@ define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, v1, 1.0
; GFX11-FLUSH-NEXT: s_denorm_mode 3
-; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FLUSH-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v4, v3, 1.0
; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -2375,7 +2375,7 @@ define <2 x float> @v_rcp_v2f32_ulp25(<2 x float> %x) {
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-IEEE-NEXT: v_sub_nc_u32_e32 v0, 0, v0
; GFX11-IEEE-NEXT: v_sub_nc_u32_e32 v1, 0, v1
-; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-IEEE-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-IEEE-NEXT: v_ldexp_f32 v0, v2, v0
; GFX11-IEEE-NEXT: v_ldexp_f32 v1, v3, v1
; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31]
@@ -2414,7 +2414,7 @@ define <2 x float> @v_fdiv_v2f32_afn_ulp25(<2 x float> %a, <2 x float> %b) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn <2 x float> %a, %b, !fpmath !0
@@ -2492,7 +2492,7 @@ define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) {
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-IEEE-NEXT: v_sub_nc_u32_e32 v2, 0, v2
; GFX11-IEEE-NEXT: v_sub_nc_u32_e32 v3, 0, v3
-; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-IEEE-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-IEEE-NEXT: v_ldexp_f32 v2, v4, v2
; GFX11-IEEE-NEXT: v_ldexp_f32 v3, v5, v3
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -2504,7 +2504,7 @@ define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) {
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FLUSH-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FLUSH-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
@@ -2535,7 +2535,7 @@ define <2 x float> @v_fdiv_v2f32_arcp_afn_ulp25(<2 x float> %a, <2 x float> %b)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn arcp <2 x float> %a, %b, !fpmath !0
@@ -2768,7 +2768,7 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #0 {
; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-IEEE-NEXT: s_denorm_mode 15
-; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-IEEE-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3
@@ -2792,7 +2792,7 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #0 {
; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-FLUSH-NEXT: s_denorm_mode 3
-; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FLUSH-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
@@ -2919,7 +2919,7 @@ define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf(float %x, float %y, float %z)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_sub_nc_u32_e32 v0, v0, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_ldexp_f32 v0, v2, v0
@@ -3042,7 +3042,7 @@ define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user(float %x, fl
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: v_sub_nc_u32_e32 v0, v0, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_ldexp_f32 v0, v3, v0
@@ -3293,7 +3293,7 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #0 {
; GFX11-IEEE-NEXT: v_div_scale_f32 v2, vcc_lo, v2, v1, v2
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v4, v3
; GFX11-IEEE-NEXT: s_denorm_mode 15
-; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-IEEE-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-IEEE-NEXT: v_fma_f32 v5, -v3, v4, 1.0
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_fmac_f32_e32 v4, v5, v4
@@ -3319,7 +3319,7 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #0 {
; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, v2, v1, v2
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
; GFX11-FLUSH-NEXT: s_denorm_mode 3
-; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FLUSH-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v3, v4, 1.0
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v4
@@ -3447,7 +3447,7 @@ define float @v_fdiv_neglhs_f32_dynamic_25ulp(float %x, float %y) #0 {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_sub_nc_u32_e32 v0, v0, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_ldexp_f32 v0, v2, v0
@@ -3697,7 +3697,7 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #0 {
; GFX11-IEEE-NEXT: v_div_scale_f32 v2, vcc_lo, v0, v2, v0
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v4, v3
; GFX11-IEEE-NEXT: s_denorm_mode 15
-; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-IEEE-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-IEEE-NEXT: v_fma_f32 v5, -v3, v4, 1.0
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_fmac_f32_e32 v4, v5, v4
@@ -3723,7 +3723,7 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #0 {
; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, v0, v2, v0
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
; GFX11-FLUSH-NEXT: s_denorm_mode 3
-; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FLUSH-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v3, v4, 1.0
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v4
@@ -3983,7 +3983,7 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #0 {
; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-IEEE-NEXT: s_denorm_mode 15
-; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-IEEE-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_fmac_f32_e32 v2, v4, v2
@@ -4007,7 +4007,7 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #0 {
; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-FLUSH-NEXT: s_denorm_mode 3
-; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FLUSH-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2
@@ -4112,7 +4112,7 @@ define float @v_fdiv_f32_constrhs0_dynamic_25ulp(float %x) #0 {
; GFX11-NEXT: v_rcp_f32_e32 v1, 0x3f40e400
; GFX11-NEXT: v_frexp_mant_f32_e32 v2, v0
; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_dual_mul_f32 v1, v2, v1 :: v_dual_add_nc_u32 v0, -14, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_ldexp_f32 v0, v1, v0
@@ -4361,7 +4361,7 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #0 {
; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-IEEE-NEXT: s_denorm_mode 15
-; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-IEEE-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_fmac_f32_e32 v2, v4, v2
@@ -4385,7 +4385,7 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #0 {
; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-FLUSH-NEXT: s_denorm_mode 3
-; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FLUSH-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2
@@ -4495,7 +4495,7 @@ define float @v_fdiv_f32_constlhs0_dynamic_25ulp(float %x) #0 {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f32_e32 v1, v1
; GFX11-NEXT: v_sub_nc_u32_e32 v0, 14, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v1, 0x3f40e400, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_ldexp_f32 v0, v1, v0
@@ -4734,7 +4734,7 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-IEEE-NEXT: s_denorm_mode 15
-; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-IEEE-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3
@@ -4758,7 +4758,7 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-FLUSH-NEXT: s_denorm_mode 3
-; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FLUSH-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
@@ -4885,7 +4885,7 @@ define float @v_fdiv_f32_dynamic_25ulp_nodenorm_x(float nofpclass(sub) %x, float
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_sub_nc_u32_e32 v0, v0, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_ldexp_f32 v0, v2, v0
@@ -5123,7 +5123,7 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-IEEE-NEXT: s_denorm_mode 15
-; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-IEEE-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3
@@ -5147,7 +5147,7 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-FLUSH-NEXT: s_denorm_mode 3
-; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FLUSH-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
@@ -5274,7 +5274,7 @@ define float @v_fdiv_f32_dynamic_25ulp_nodenorm_y(float %x, float nofpclass(sub)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_sub_nc_u32_e32 v0, v0, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_ldexp_f32 v0, v2, v0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
index ea149cc..31fc1ee 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
@@ -89,7 +89,7 @@ define double @v_fdiv_f64(double %a, double %b) {
; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -137,7 +137,7 @@ define double @v_fdiv_f64_afn(double %a, double %b) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
@@ -228,7 +228,7 @@ define double @v_fdiv_f64_ulp25(double %a, double %b) {
; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -321,7 +321,7 @@ define double @v_rcp_f64(double %x) {
; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GFX11-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -414,7 +414,7 @@ define double @v_rcp_f64_arcp(double %x) {
; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GFX11-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -460,7 +460,7 @@ define double @v_rcp_f64_arcp_afn(double %x) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
@@ -551,7 +551,7 @@ define double @v_rcp_f64_ulp25(double %x) {
; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GFX11-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -599,7 +599,7 @@ define double @v_fdiv_f64_afn_ulp25(double %a, double %b) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
@@ -690,7 +690,7 @@ define double @v_fdiv_f64_arcp_ulp25(double %a, double %b) {
; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -835,7 +835,7 @@ define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_rcp_f64_e32 v[12:13], v[8:9]
; GFX11-NEXT: v_rcp_f64_e32 v[14:15], v[10:11]
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
; GFX11-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -914,7 +914,7 @@ define <2 x double> @v_fdiv_v2f64_afn(<2 x double> %a, <2 x double> %b) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -1068,7 +1068,7 @@ define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_rcp_f64_e32 v[12:13], v[8:9]
; GFX11-NEXT: v_rcp_f64_e32 v[14:15], v[10:11]
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
; GFX11-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -1228,7 +1228,7 @@ define <2 x double> @v_rcp_v2f64(<2 x double> %x) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -1388,7 +1388,7 @@ define <2 x double> @v_rcp_v2f64_arcp(<2 x double> %x) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -1463,7 +1463,7 @@ define <2 x double> @v_rcp_v2f64_arcp_afn(<2 x double> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[0:1]
; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[2:3]
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
; GFX11-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -1615,7 +1615,7 @@ define <2 x double> @v_rcp_v2f64_ulp25(<2 x double> %x) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -1694,7 +1694,7 @@ define <2 x double> @v_fdiv_v2f64_afn_ulp25(<2 x double> %a, <2 x double> %b) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -1848,7 +1848,7 @@ define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_rcp_f64_e32 v[12:13], v[8:9]
; GFX11-NEXT: v_rcp_f64_e32 v[14:15], v[10:11]
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
; GFX11-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -1927,7 +1927,7 @@ define <2 x double> @v_fdiv_v2f64_arcp_afn_ulp25(<2 x double> %a, <2 x double> %
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
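
The fdiv/rcp hunks above all land at the same point in the f64 reciprocal expansion: v_rcp_f64 produces an estimate r, and the visible v_fma_f64 ..., -v[...], v[...], 1.0 computes the error term e = 1 - x*r. A minimal C++ sketch of one refinement step, assuming the usual fma-based Newton-Raphson iteration (the follow-up fma sits outside the excerpted hunks):

#include <cmath>

// One Newton-Raphson refinement of a reciprocal estimate r for 1/x:
//   e  = fma(-x, r, 1.0)  <- matches "v_fma_f64 ..., -v[...], v[...], 1.0"
//   r' = fma(r, e, r)     <- assumed next step, not shown in these hunks
double refineRcp(double x, double r) {
  double e = std::fma(-x, r, 1.0);
  return std::fma(r, e, r);
}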
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index 2356dad..13d6597 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -2848,7 +2848,7 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_mov_b32 s2, 3
; GFX12-NEXT: s_mov_b32 s1, 2
; GFX12-NEXT: s_mov_b32 s0, 1
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1
; GFX12-NEXT: v_mov_b32_e32 v1, s0
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -3229,7 +3229,7 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_mov_b32 s2, 3
; GFX12-NEXT: s_mov_b32 s1, 2
; GFX12-NEXT: s_mov_b32 s0, 1
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
; GFX12-NEXT: s_wait_storecnt 0x0
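
The two spellings in these hunks are equivalent; the assembler now prints symbolic depctr fields instead of a raw immediate. A small self-checking C++ sketch, assuming the bit layout these pairs imply (va_vdst in simm16 bits [15:12], sa_sdst in bit 0, all-ones meaning "no wait"); the same 0xfff to depctr_va_vdst(0) rewrite recurs in the fpow.ll hunks further down:

#include <cstdint>

// Decode the two depctr fields visible in this diff.
constexpr unsigned vaVdst(uint16_t depctr) { return (depctr >> 12) & 0xf; }
constexpr unsigned saSdst(uint16_t depctr) { return depctr & 0x1; }

// s_waitcnt_depctr 0xfff == depctr_va_vdst(0): wait for all VALU vdst writes.
static_assert(vaVdst(0x0fff) == 0, "0xfff clears the va_vdst field");
// s_wait_alu 0xfffe == depctr_sa_sdst(0): wait for the SALU sdst write.
static_assert(saSdst(0xfffe) == 0, "0xfffe clears the sa_sdst bit");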
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.ll
new file mode 100644
index 0000000..84ac58f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.ll
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-FAKE16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-TRUE16 %s
+
+define amdgpu_ps half @fmul_s16_uniform(half inreg %a, half inreg %b) {
+; GFX11-FAKE16-LABEL: fmul_s16_uniform:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_mul_f16_e64 v0, s0, s1
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: fmul_s16_uniform:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, s0, s1
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: fmul_s16_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_mul_f16 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
+ %result = fmul half %a, %b
+ ret half %result
+}
+
+define amdgpu_ps half @fmul_s16_div(half %a, half %b) {
+; GFX11-FAKE16-LABEL: fmul_s16_div:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: fmul_s16_div:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: fmul_s16_div:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX12-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: fmul_s16_div:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX12-TRUE16-NEXT: ; return to shader part epilog
+ %result = fmul half %a, %b
+ ret half %result
+}
+
+define amdgpu_ps float @fmul_s32_uniform(float inreg %a, float inreg %b) {
+; GFX11-LABEL: fmul_s32_uniform:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mul_f32_e64 v0, s0, s1
+; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: fmul_s32_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_mul_f32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
+ %result = fmul float %a, %b
+ ret float %result
+}
+
+define amdgpu_ps float @fmul_s32_div(float %a, float %b) {
+; GCN-LABEL: fmul_s32_div:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT: ; return to shader part epilog
+ %result = fmul float %a, %b
+ ret float %result
+}
+
+define amdgpu_ps void @fmul_s64_uniform(double inreg %a, double inreg %b, ptr addrspace(1) %ptr) {
+; GFX11-LABEL: fmul_s64_uniform:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mul_f64 v[2:3], s[0:1], s[2:3]
+; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: fmul_s64_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mul_f64_e64 v[2:3], s[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-NEXT: s_endpgm
+ %result = fmul double %a, %b
+ store double %result, ptr addrspace(1) %ptr
+ ret void
+}
+
+define amdgpu_ps void @fmul_s64_div(double %a, double %b, ptr addrspace(1) %ptr) {
+; GFX11-LABEL: fmul_s64_div:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-NEXT: global_store_b64 v[4:5], v[0:1], off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: fmul_s64_div:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mul_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: global_store_b64 v[4:5], v[0:1], off
+; GFX12-NEXT: s_endpgm
+ %result = fmul double %a, %b
+ store double %result, ptr addrspace(1) %ptr
+ ret void
+}
+
+define amdgpu_ps <2 x half> @fmul_v2s16_uniform(<2 x half> inreg %a, <2 x half> inreg %b) {
+; GFX11-LABEL: fmul_v2s16_uniform:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_pk_mul_f16 v0, s0, s1
+; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: fmul_v2s16_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_lshr_b32 s2, s0, 16
+; GFX12-NEXT: s_lshr_b32 s3, s1, 16
+; GFX12-NEXT: s_mul_f16 s0, s0, s1
+; GFX12-NEXT: s_mul_f16 s1, s2, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
+ %result = fmul <2 x half> %a, %b
+ ret <2 x half> %result
+}
+
+define amdgpu_ps <2 x half> @fmul_v2s16_div(<2 x half> %a, <2 x half> %b) {
+; GCN-LABEL: fmul_v2s16_div:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_f16 v0, v0, v1
+; GCN-NEXT: ; return to shader part epilog
+ %result = fmul <2 x half> %a, %b
+ ret <2 x half> %result
+}
+
+define amdgpu_ps <2 x float> @fmul_v2s32_uniform(<2 x float> inreg %a, <2 x float> inreg %b) {
+; GFX11-LABEL: fmul_v2s32_uniform:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mul_f32_e64 v0, s0, s2
+; GFX11-NEXT: v_mul_f32_e64 v1, s1, s3
+; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: fmul_v2s32_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_mul_f32 s0, s0, s2
+; GFX12-NEXT: s_mul_f32 s1, s1, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: ; return to shader part epilog
+ %result = fmul <2 x float> %a, %b
+ ret <2 x float> %result
+}
+
+define amdgpu_ps <2 x float> @fmul_v2s32_div(<2 x float> %a, <2 x float> %b) {
+; GCN-LABEL: fmul_v2s32_div:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
+; GCN-NEXT: ; return to shader part epilog
+ %result = fmul <2 x float> %a, %b
+ ret <2 x float> %result
+}
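
The new file pins down one pattern per type: with inreg (uniform) operands, GFX12 keeps the fmul on the SALU (s_mul_f16, s_mul_f32) and only copies the result to a VGPR at the end, while GFX11, which has no scalar FP multiply, and every divergent case go straight to the VALU; f64 stays on the VALU even when uniform (v_mul_f64), and uniform v2f16 on GFX12 is scalarized with s_lshr_b32 / s_pack_ll_b32_b16 around two s_mul_f16. A hypothetical C++ helper (not LLVM's actual API) capturing the decision these checks encode:

// Sketch only: models the register-bank choice the checks above exhibit.
enum class Bank { SGPR, VGPR };

Bank pickFMulBank(bool operandsUniform, bool hasScalarFMulForType) {
  if (operandsUniform && hasScalarFMulForType)
    return Bank::SGPR; // GFX12 f16/f32: s_mul_f16 / s_mul_f32
  return Bank::VGPR;   // GFX11, divergent values, and f64: v_mul_*
}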
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
index e03aa18..1220c0e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
@@ -4,6 +4,8 @@
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX10 %s
+; TODO: Switch test to use -new-reg-bank-select after adding G_FNEG support.
+
define <2 x half> @v_fmul_v2f16(<2 x half> %a, <2 x half> %b) {
; GFX9-LABEL: v_fmul_v2f16:
; GFX9: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fneg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fneg.ll
new file mode 100644
index 0000000..ebc28cb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fneg.ll
@@ -0,0 +1,303 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12 %s
+
+define amdgpu_ps void @v_fneg_f16(half %in, ptr addrspace(1) %out) {
+; GCN-LABEL: v_fneg_f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GCN-NEXT: global_store_b16 v[1:2], v0, off
+; GCN-NEXT: s_endpgm
+ %fneg = fneg half %in
+ store half %fneg, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fneg_f16(half inreg %in, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fneg_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_xor_b32_e64 v2, 0x8000, s0
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fneg_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_xor_b32 s0, s0, 0x8000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %fneg = fneg half %in
+ store half %fneg, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fneg_f16_salu_use(half inreg %in, i32 inreg %val, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fneg_f16_salu_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_xor_b32_e64 v2, 0x8000, s0
+; GFX11-NEXT: s_cmp_eq_u32 s1, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: s_cselect_b32 s0, s0, 0
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fneg_f16_salu_use:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_xor_b32 s0, s0, 0x8000
+; GFX12-NEXT: s_cmp_eq_u32 s1, 0
+; GFX12-NEXT: s_cselect_b32 s0, s0, 0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %fneg = fneg half %in
+ %cond = icmp eq i32 %val, 0
+ %sel = select i1 %cond, half %fneg, half 0.0
+ store half %sel, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_fneg_f32(float %in, ptr addrspace(1) %out) {
+; GCN-LABEL: v_fneg_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: global_store_b32 v[1:2], v0, off
+; GCN-NEXT: s_endpgm
+ %fneg = fneg float %in
+ store float %fneg, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fneg_f32(float inreg %in, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fneg_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_xor_b32_e64 v2, 0x80000000, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fneg_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_xor_b32 s0, s0, 0x80000000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %fneg = fneg float %in
+ store float %fneg, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fneg_f32_salu_use(float inreg %in, i32 inreg %val, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fneg_f32_salu_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_xor_b32_e64 v2, 0x80000000, s0
+; GFX11-NEXT: s_cmp_eq_u32 s1, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: s_cselect_b32 s0, s0, 0
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fneg_f32_salu_use:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_xor_b32 s0, s0, 0x80000000
+; GFX12-NEXT: s_cmp_eq_u32 s1, 0
+; GFX12-NEXT: s_cselect_b32 s0, s0, 0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %fneg = fneg float %in
+ %cond = icmp eq i32 %val, 0
+ %sel = select i1 %cond, float %fneg, float 0.0
+ store float %sel, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_fneg_f64(double %in, ptr addrspace(1) %out) {
+; GCN-LABEL: v_fneg_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GCN-NEXT: s_endpgm
+ %fneg = fneg double %in
+ store double %fneg, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fneg_f64(double inreg %in, ptr addrspace(1) %out) {
+; GCN-LABEL: s_fneg_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
+; GCN-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GCN-NEXT: s_endpgm
+ %fneg = fneg double %in
+ store double %fneg, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fneg_f64_salu_use(double inreg %in, i32 inreg %val, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fneg_f64_salu_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: s_cmp_eq_u32 s2, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v3
+; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fneg_f64_salu_use:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: s_cmp_eq_u32 s2, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
+; GFX12-NEXT: v_readfirstlane_b32 s0, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_readfirstlane_b32 s1, v3
+; GFX12-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-NEXT: s_endpgm
+ %fneg = fneg double %in
+ %cond = icmp eq i32 %val, 0
+ %sel = select i1 %cond, double %fneg, double 0.0
+ store double %sel, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_fneg_v2f16(<2 x half> %in, ptr addrspace(1) %out) {
+; GCN-LABEL: v_fneg_v2f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; GCN-NEXT: global_store_b32 v[1:2], v0, off
+; GCN-NEXT: s_endpgm
+ %fneg = fneg <2 x half> %in
+ store <2 x half> %fneg, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fneg_v2f16(<2 x half> inreg %in, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fneg_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_xor_b32_e64 v2, 0x80008000, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fneg_v2f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_lshr_b32 s1, s0, 16
+; GFX12-NEXT: s_xor_b32 s0, s0, 0x8000
+; GFX12-NEXT: s_xor_b32 s1, s1, 0x8000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %fneg = fneg <2 x half> %in
+ store <2 x half> %fneg, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fneg_v2f16_salu_use(<2 x half> inreg %in, i32 inreg %val, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fneg_v2f16_salu_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_xor_b32_e64 v2, 0x80008000, s0
+; GFX11-NEXT: s_cmp_eq_u32 s1, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: s_cselect_b32 s0, s0, 0
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fneg_v2f16_salu_use:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_lshr_b32 s2, s0, 16
+; GFX12-NEXT: s_xor_b32 s0, s0, 0x8000
+; GFX12-NEXT: s_xor_b32 s2, s2, 0x8000
+; GFX12-NEXT: s_cmp_eq_u32 s1, 0
+; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_cselect_b32 s0, s0, 0
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %fneg = fneg <2 x half> %in
+ %cond = icmp eq i32 %val, 0
+ %sel = select i1 %cond, <2 x half> %fneg, <2 x half> <half 0.0, half 0.0>
+ store <2 x half> %sel, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_fneg_v2f32(<2 x float> %in, ptr addrspace(1) %out) {
+; GCN-LABEL: v_fneg_v2f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GCN-NEXT: s_endpgm
+ %fneg = fneg <2 x float> %in
+ store <2 x float> %fneg, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fneg_v2f32(<2 x float> inreg %in, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fneg_v2f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_xor_b32_e64 v2, 0x80000000, s0
+; GFX11-NEXT: v_xor_b32_e64 v3, 0x80000000, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: v_readfirstlane_b32 s1, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fneg_v2f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_xor_b32 s0, s0, 0x80000000
+; GFX12-NEXT: s_xor_b32 s1, s1, 0x80000000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-NEXT: s_endpgm
+ %fneg = fneg <2 x float> %in
+ store <2 x float> %fneg, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fneg_v2f32_salu_use(<2 x float> inreg %in, i32 inreg %val, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fneg_v2f32_salu_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_xor_b32_e64 v2, 0x80000000, s0
+; GFX11-NEXT: v_xor_b32_e64 v3, 0x80000000, s1
+; GFX11-NEXT: s_cmp_eq_u32 s2, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: v_readfirstlane_b32 s1, v3
+; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fneg_v2f32_salu_use:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_xor_b32 s0, s0, 0x80000000
+; GFX12-NEXT: s_xor_b32 s1, s1, 0x80000000
+; GFX12-NEXT: s_cmp_eq_u32 s2, 0
+; GFX12-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-NEXT: s_endpgm
+ %fneg = fneg <2 x float> %in
+ %cond = icmp eq i32 %val, 0
+ %sel = select i1 %cond, <2 x float> %fneg, <2 x float> <float 0.0, float 0.0>
+ store <2 x float> %sel, ptr addrspace(1) %out
+ ret void
+}
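
Every check in this new file reduces fneg to a sign-bit XOR: 0x8000 for f16, 0x80000000 for f32, 0x80008000 for packed <2 x half>, and for f64 only the high dword (v1) is flipped. On GFX12 the uniform cases stay on the SALU as s_xor_b32. A minimal C++ illustration of the f32 case:

#include <bit>
#include <cstdint>

// IEEE-754 negation is a sign-bit flip; no FP arithmetic is involved,
// which is why a uniform fneg can lower to a plain scalar XOR.
float fnegViaXor(float x) {
  return std::bit_cast<float>(std::bit_cast<uint32_t>(x) ^ 0x80000000u);
}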
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 7e297f4..5858612 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -41,6 +41,7 @@ define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, doub
;
; GFX1250-LABEL: raw_buffer_atomic_add_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -71,6 +72,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, doub
;
; GFX1250-LABEL: raw_buffer_atomic_add_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
@@ -114,6 +116,7 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr
;
; GFX1250-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -160,6 +163,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8)
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_add_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -190,6 +194,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inreg
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_add_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
@@ -233,6 +238,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -279,6 +285,7 @@ define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, d
;
; GFX1250-LABEL: struct_buffer_atomic_add_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -309,6 +316,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, d
;
; GFX1250-LABEL: struct_buffer_atomic_add_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
@@ -352,6 +360,7 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %
;
; GFX1250-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -397,6 +406,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_add_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -427,6 +437,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inr
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_add_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
@@ -470,6 +481,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -515,6 +527,7 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, doub
;
; GFX1250-LABEL: raw_buffer_atomic_min_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -545,6 +558,7 @@ define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, doub
;
; GFX1250-LABEL: raw_buffer_atomic_min_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
@@ -588,6 +602,7 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr
;
; GFX1250-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -634,6 +649,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8)
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -664,6 +680,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inreg
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_min_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
@@ -707,6 +724,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -753,6 +771,7 @@ define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, d
;
; GFX1250-LABEL: struct_buffer_atomic_min_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -783,6 +802,7 @@ define amdgpu_ps void @struct_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, d
;
; GFX1250-LABEL: struct_buffer_atomic_min_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
@@ -826,6 +846,7 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %
;
; GFX1250-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -871,6 +892,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_min_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -901,6 +923,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inr
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_min_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
@@ -944,6 +967,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -989,6 +1013,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, doub
;
; GFX1250-LABEL: raw_buffer_atomic_max_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -1019,6 +1044,7 @@ define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, doub
;
; GFX1250-LABEL: raw_buffer_atomic_max_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
@@ -1062,6 +1088,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr
;
; GFX1250-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -1108,6 +1135,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8)
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -1138,6 +1166,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inreg
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_max_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
@@ -1181,6 +1210,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -1227,6 +1257,7 @@ define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, d
;
; GFX1250-LABEL: struct_buffer_atomic_max_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -1257,6 +1288,7 @@ define amdgpu_ps void @struct_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, d
;
; GFX1250-LABEL: struct_buffer_atomic_max_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
@@ -1300,6 +1332,7 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %
;
; GFX1250-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -1345,6 +1378,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_max_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -1375,6 +1409,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inr
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_max_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
@@ -1418,6 +1453,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -1486,6 +1522,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
;
; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-NEXT: s_mov_b32 s1, exec_lo
; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
@@ -1501,7 +1538,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1559,6 +1595,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
;
; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_agent:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-NEXT: s_mov_b32 s1, exec_lo
; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
@@ -1574,7 +1611,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1634,6 +1670,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
;
; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_system:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-NEXT: s_mov_b32 s1, exec_lo
; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
@@ -1649,7 +1686,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1707,6 +1743,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
;
; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_flush:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-NEXT: s_mov_b32 s1, exec_lo
; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
@@ -1722,7 +1759,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1898,6 +1934,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
;
; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-NEXT: s_mov_b32 s1, exec_lo
; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
@@ -1913,7 +1950,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1954,12 +1990,12 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
;
; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -1997,12 +2033,12 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
;
; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2042,12 +2078,12 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
;
; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat_system:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -2205,12 +2241,12 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
;
; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2264,6 +2300,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
;
; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-NEXT: s_mov_b32 s1, exec_lo
; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
@@ -2329,6 +2366,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
;
; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat_flush:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-NEXT: s_mov_b32 s1, exec_lo
; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
@@ -2394,6 +2432,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
;
; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-NEXT: s_mov_b32 s1, exec_lo
; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
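
Every GFX1250 prologue in this file gains the same s_setreg_imm32_b32 writing 1 into a one-bit field at offset 25 of HW_REG_WAVE_MODE (the diff does not say what that mode bit controls on gfx1250). For reference, a C++ sketch of how an hwreg(id, offset, width) operand packs into the 16-bit s_setreg immediate, assuming the usual {width-1 in bits [15:11], offset in [10:6], id in [5:0]} layout from the ISA docs:

#include <cstdint>

// Pack an s_setreg hwreg(...) operand; field names follow the disassembly above.
uint16_t encodeHwreg(unsigned id, unsigned offset, unsigned width) {
  return uint16_t((id & 0x3f) | ((offset & 0x1f) << 6) |
                  (((width - 1) & 0x1f) << 11));
}
// hwreg(HW_REG_WAVE_MODE, 25, 1) selects the single bit at offset 25; the
// trailing ", 1" on s_setreg_imm32_b32 is the value written into that field.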
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
index 99261cc..98a26a4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
@@ -109,7 +109,7 @@ define float @v_pow_f32(float %x, float %y) {
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_log_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -119,7 +119,7 @@ define float @v_pow_f32(float %x, float %y) {
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%pow = call float @llvm.pow.f32(float %x, float %y)
@@ -287,7 +287,7 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
; GFX11-NEXT: v_ldexp_f32 v1, v1, v5
; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s0
; GFX11-NEXT: v_log_f32_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_dual_sub_f32 v1, v1, v5 :: v_dual_lshlrev_b32 v4, 5, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_ldexp_f32 v0, v0, v4
@@ -298,7 +298,7 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
; GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v3
; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v2
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0
@@ -306,13 +306,13 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
; GFX11-NEXT: v_exp_f32_e32 v1, v1
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_ldexp_f32 v1, v1, v3
; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_ldexp_f32 v0, v0, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
%pow = call <2 x float> @llvm.pow.v2f32(<2 x float> %x, <2 x float> %y)
@@ -377,7 +377,7 @@ define half @v_pow_f16(half %x, half %y) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l
-; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1
@@ -391,7 +391,7 @@ define half @v_pow_f16(half %x, half %y) {
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_log_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1
@@ -496,7 +496,7 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.h, v0.h
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v1.l
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -519,7 +519,7 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_log_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -530,7 +530,7 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
; GFX11-FAKE16-NEXT: v_exp_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_exp_f16_e32 v0, v0
-; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
@@ -641,7 +641,7 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.h, v0.h
-; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -666,7 +666,7 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_log_f16_e32 v0, v0
-; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3
@@ -676,7 +676,7 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_exp_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: v_exp_f16_e32 v0, v0
-; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
@@ -789,7 +789,7 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v1.l
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -812,7 +812,7 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
; GFX11-FAKE16-NEXT: v_log_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -824,7 +824,7 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
; GFX11-FAKE16-NEXT: v_exp_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_exp_f16_e32 v0, v0
-; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
@@ -946,7 +946,7 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v1.l
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -972,7 +972,7 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_log_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -983,7 +983,7 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
; GFX11-FAKE16-NEXT: v_exp_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_exp_f16_e32 v0, v0
-; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
@@ -1102,7 +1102,7 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) {
; GFX11-NEXT: v_ldexp_f32 v0, |v0|, v2
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s0
; GFX11-NEXT: v_log_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1
@@ -1112,7 +1112,7 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) {
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fabs.x = call float @llvm.fabs.f32(float %x)
@@ -1223,7 +1223,7 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) {
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_log_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX11-NEXT: v_mul_dx9_zero_f32_e64 v0, v0, |v1|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -1233,7 +1233,7 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) {
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fabs.y = call float @llvm.fabs.f32(float %y)
@@ -1344,7 +1344,7 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
; GFX11-NEXT: v_ldexp_f32 v0, |v0|, v2
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s0
; GFX11-NEXT: v_log_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_dx9_zero_f32_e64 v0, v0, |v1|
@@ -1354,7 +1354,7 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fabs.x = call float @llvm.fabs.f32(float %x)
@@ -1461,7 +1461,7 @@ define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_ldexp_f32 v1, s0, v1
; GFX11-NEXT: v_log_f32_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_sub_f32_e32 v1, v1, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v1, v0
@@ -1471,7 +1471,7 @@ define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) {
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: ; return to shader part epilog
%pow = call float @llvm.pow.f32(float %x, float %y)
@@ -1576,7 +1576,7 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) {
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_log_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, s0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -1586,7 +1586,7 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) {
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: ; return to shader part epilog
%pow = call float @llvm.pow.f32(float %x, float %y)
@@ -1691,7 +1691,7 @@ define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_ldexp_f32 v0, s0, v0
; GFX11-NEXT: v_log_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, s1, v0
@@ -1701,7 +1701,7 @@ define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) {
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: ; return to shader part epilog
%pow = call float @llvm.pow.f32(float %x, float %y)
@@ -1811,7 +1811,7 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) {
; GFX11-NEXT: v_ldexp_f32 v0, -v0, v2
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s0
; GFX11-NEXT: v_log_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1
@@ -1821,7 +1821,7 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) {
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%neg.x = fneg float %x
@@ -1932,7 +1932,7 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) {
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_log_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX11-NEXT: v_mul_dx9_zero_f32_e64 v0, v0, -v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -1942,7 +1942,7 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) {
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%neg.y = fneg float %y
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 715a777..54efb26 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -133,7 +133,7 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_lshr_b32 s1, s1, 1
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
@@ -298,7 +298,7 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v3, v3
; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3
@@ -1560,7 +1560,7 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
; GFX11-NEXT: s_bfe_u32 s1, s1, 0x170001
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1720,7 +1720,7 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
; GFX11-NEXT: v_bfe_u32 v1, v1, 1, 23
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -2201,7 +2201,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX11-NEXT: s_and_b32 s5, s5, 0xff
; GFX11-NEXT: s_or_b32 s4, s4, s11
; GFX11-NEXT: s_lshl_b32 s5, s5, 8
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: s_and_b32 s13, s13, 0xff
; GFX11-NEXT: s_or_b32 s5, s12, s5
@@ -2504,7 +2504,7 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
; GFX11-NEXT: v_bfe_u32 v3, v3, 1, 23
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
; GFX11-NEXT: v_cvt_u32_f32_e32 v6, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr-new-regbank-select.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr-new-regbank-select.ll
new file mode 100644
index 0000000..ec74b88
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr-new-regbank-select.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck %s
+
+define amdgpu_ps void @uniform_fshr_i32(i32 inreg %lhs, i32 inreg %rhs, i32 inreg %amt, ptr %resptr) {
+; CHECK-LABEL: uniform_fshr_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_alignbit_b32 v2, s0, s1, v2
+; CHECK-NEXT: v_readfirstlane_b32 s0, v2
+; CHECK-NEXT: s_add_co_i32 s0, s0, s0
+; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: flat_store_b32 v[0:1], v2
+; CHECK-NEXT: s_endpgm
+ %vres = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt)
+ %add = add i32 %vres, %vres
+ store i32 %add, ptr %resptr
+ ret void
+}
+
+declare i32 @llvm.amdgcn.readfirstlane.i32(i32)
+
+define amdgpu_ps void @divergent_fshr_i32(i32 %lhs, i32 %rhs, i32 %amt, ptr %resptr) {
+; CHECK-LABEL: divergent_fshr_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_alignbit_b32 v0, v0, v1, v2
+; CHECK-NEXT: flat_store_b32 v[3:4], v0
+; CHECK-NEXT: s_endpgm
+ %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt)
+ store i32 %result, ptr %resptr
+ ret void
+}
+
+declare i32 @llvm.fshr.i32(i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index 5aa5a671..1e762f9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -134,7 +134,7 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
; GFX11-NEXT: s_and_b32 s1, s1, 0x7f
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -299,7 +299,7 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
; GFX11-NEXT: v_and_b32_e32 v1, 0x7f, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1576,7 +1576,7 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
; GFX11-NEXT: s_lshl_b32 s0, s0, 1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1741,7 +1741,7 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -2222,7 +2222,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX11-NEXT: s_and_b32 s5, s5, 0xff
; GFX11-NEXT: s_or_b32 s4, s4, s15
; GFX11-NEXT: s_lshl_b32 s5, s5, 8
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: s_and_b32 s17, s17, 0xff
; GFX11-NEXT: s_or_b32 s5, s16, s5
@@ -2531,7 +2531,7 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v3
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_dual_mul_f32 v6, 0x4f7ffffe, v6 :: v_dual_lshlrev_b32 v1, 1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_u32_f32_e32 v6, v6
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value-addrspaces.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value-addrspaces.ll
new file mode 100644
index 0000000..cf9524b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value-addrspaces.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
+
+@flat = external global i32, align 4
+@global = external addrspace(1) global i32, align 4
+@lds = addrspace(3) global i32 poison, align 4
+@constant = external addrspace(4) constant i32, align 4
+@buf = external addrspace(8) global i8
+
+define ptr @global_value_as0_external() {
+; GCN-LABEL: global_value_as0_external:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, flat@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, flat@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ ret ptr @flat
+}
+
+define ptr addrspace(1) @global_value_as1_external() {
+; GCN-LABEL: global_value_as1_external:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, global@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, global@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ ret ptr addrspace(1) @global
+}
+
+define ptr addrspace(4) @global_value_as4_external() {
+; GCN-LABEL: global_value_as4_external:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, constant@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, constant@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ ret ptr addrspace(4) @constant
+}
+
+define amdgpu_kernel void @global_value_as3_lds_kernel(ptr addrspace(1) %out) {
+; GCN-LABEL: global_value_as3_lds_kernel:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_store_dword v0, v0, s[0:1]
+; GCN-NEXT: s_endpgm
+ %addr = ptrtoint ptr addrspace(3) @lds to i32
+ store i32 %addr, ptr addrspace(1) %out
+ ret void
+}
+
+define void @global_value_as8_buffer_store(i32 %val) {
+; GCN-LABEL: global_value_as8_buffer_store:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_getpc_b64 s[8:9]
+; GCN-NEXT: s_add_u32 s8, s8, buf@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s9, s9, buf@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %val, ptr addrspace(8) @buf, i32 0, i32 0, i32 0)
+ ret void
+}
+
+define i32 @global_value_as8_buffer_load(i32 %offset) {
+; GCN-LABEL: global_value_as8_buffer_load:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_getpc_b64 s[8:9]
+; GCN-NEXT: s_add_u32 s8, s8, buf@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s9, s9, buf@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %val = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) @buf, i32 %offset, i32 0, i32 0)
+ ret i32 %val
+}
+
+declare void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32, ptr addrspace(8) nocapture writeonly, i32, i32, i32 immarg) #0
+declare i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) nocapture readonly, i32, i32, i32 immarg) #1
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
index 9539ec4..91ee764 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
@@ -11,28 +11,40 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX8V4-LABEL: addrspacecast:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x40
; GFX8V4-NEXT: s_add_i32 s12, s12, s17
; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8V4-NEXT: s_add_u32 s2, s6, 0x44
+; GFX8V4-NEXT: s_addc_u32 s3, s7, 0
+; GFX8V4-NEXT: v_mov_b32_e32 v0, s2
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8V4-NEXT: s_mov_b32 s4, s0
-; GFX8V4-NEXT: s_mov_b32 s5, s3
; GFX8V4-NEXT: s_cmp_lg_u32 s0, -1
-; GFX8V4-NEXT: s_cselect_b64 s[4:5], s[4:5], 0
-; GFX8V4-NEXT: s_mov_b32 s6, s1
-; GFX8V4-NEXT: s_mov_b32 s7, s2
+; GFX8V4-NEXT: v_mov_b32_e32 v1, s3
+; GFX8V4-NEXT: s_cselect_b32 s2, 1, 0
+; GFX8V4-NEXT: s_and_b32 s4, 1, s2
+; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8V4-NEXT: s_add_u32 s2, s6, 64
+; GFX8V4-NEXT: flat_load_dword v3, v[0:1]
+; GFX8V4-NEXT: s_addc_u32 s3, s7, 0
+; GFX8V4-NEXT: v_mov_b32_e32 v0, s2
+; GFX8V4-NEXT: v_mov_b32_e32 v1, s3
+; GFX8V4-NEXT: flat_load_dword v4, v[0:1]
; GFX8V4-NEXT: s_cmp_lg_u32 s1, -1
-; GFX8V4-NEXT: v_mov_b32_e32 v0, s4
-; GFX8V4-NEXT: s_cselect_b64 s[0:1], s[6:7], 0
-; GFX8V4-NEXT: v_mov_b32_e32 v2, 1
-; GFX8V4-NEXT: v_mov_b32_e32 v1, s5
-; GFX8V4-NEXT: flat_store_dword v[0:1], v2
-; GFX8V4-NEXT: s_waitcnt vmcnt(0)
; GFX8V4-NEXT: v_mov_b32_e32 v0, s0
-; GFX8V4-NEXT: v_mov_b32_e32 v2, 2
+; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0
+; GFX8V4-NEXT: s_and_b32 s0, 1, s0
; GFX8V4-NEXT: v_mov_b32_e32 v1, s1
-; GFX8V4-NEXT: flat_store_dword v[0:1], v2
+; GFX8V4-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8V4-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
+; GFX8V4-NEXT: v_mov_b32_e32 v5, 1
+; GFX8V4-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX8V4-NEXT: v_cndmask_b32_e64 v2, 0, v1, s[0:1]
+; GFX8V4-NEXT: s_waitcnt vmcnt(1)
+; GFX8V4-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX8V4-NEXT: flat_store_dword v[0:1], v5
+; GFX8V4-NEXT: s_waitcnt vmcnt(0)
+; GFX8V4-NEXT: v_mov_b32_e32 v0, 2
+; GFX8V4-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1]
+; GFX8V4-NEXT: flat_store_dword v[2:3], v0
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
; GFX8V4-NEXT: s_endpgm
;
@@ -124,13 +136,15 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 {
; GFX8V4-LABEL: llvm_amdgcn_is_shared:
; GFX8V4: ; %bb.0:
-; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x40
-; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0
-; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0
+; GFX8V4-NEXT: s_add_u32 s0, s6, 64
+; GFX8V4-NEXT: s_addc_u32 s1, s7, 0
; GFX8V4-NEXT: v_mov_b32_e32 v0, s0
+; GFX8V4-NEXT: v_mov_b32_e32 v1, s1
+; GFX8V4-NEXT: flat_load_dword v0, v[0:1]
+; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8V4-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8V4-NEXT: v_cmp_eq_u32_e32 vcc, s1, v0
+; GFX8V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8V4-NEXT: flat_store_dword v[0:1], v0
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
; GFX8V4-NEXT: s_endpgm
@@ -180,13 +194,15 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 {
define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 {
; GFX8V4-LABEL: llvm_amdgcn_is_private:
; GFX8V4: ; %bb.0:
-; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x44
-; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0
-; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0
+; GFX8V4-NEXT: s_add_u32 s0, s6, 0x44
+; GFX8V4-NEXT: s_addc_u32 s1, s7, 0
; GFX8V4-NEXT: v_mov_b32_e32 v0, s0
+; GFX8V4-NEXT: v_mov_b32_e32 v1, s1
+; GFX8V4-NEXT: flat_load_dword v0, v[0:1]
+; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8V4-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8V4-NEXT: v_cmp_eq_u32_e32 vcc, s1, v0
+; GFX8V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8V4-NEXT: flat_store_dword v[0:1], v0
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
; GFX8V4-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll
index 1a7ccf0..588802c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
-; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
define amdgpu_kernel void @fcmp_uniform_select(float %a, i32 %b, i32 %c, ptr addrspace(1) %out) {
; GFX7-LABEL: fcmp_uniform_select:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir
index 67cc016..b6652f6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir
@@ -1,7 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx700 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX7 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX8 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx700 -run-pass=instruction-select %s -o - | FileCheck -check-prefixes=GFX7 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass=instruction-select %s -o - | FileCheck -check-prefixes=GFX8 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select %s -o - | FileCheck -check-prefixes=GFX11 %s
---
name: test_copy_scc_vcc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll
index d4b485a..3043484 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll
@@ -22,7 +22,7 @@ define amdgpu_cs_chain void @chain_call(<3 x i32> inreg %sgpr, { i32, ptr addrsp
; GFX11-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee
; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
; GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GFX11-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee
+ ; GFX11-NEXT: [[GV1:%[0-9]+]]:sgpr_64(p0) = G_GLOBAL_VALUE @callee
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
; GFX11-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
; GFX11-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
@@ -51,7 +51,7 @@ define amdgpu_cs_chain void @chain_call(<3 x i32> inreg %sgpr, { i32, ptr addrsp
; GFX10-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee
; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GFX10-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee
+ ; GFX10-NEXT: [[GV1:%[0-9]+]]:sgpr_64(p0) = G_GLOBAL_VALUE @callee
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
; GFX10-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
@@ -86,7 +86,7 @@ define amdgpu_cs_chain void @chain_preserve_call(<3 x i32> inreg %sgpr, { i32, p
; GFX11-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee_preserve
; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
; GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GFX11-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve
+ ; GFX11-NEXT: [[GV1:%[0-9]+]]:sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
; GFX11-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
; GFX11-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
@@ -115,7 +115,7 @@ define amdgpu_cs_chain void @chain_preserve_call(<3 x i32> inreg %sgpr, { i32, p
; GFX10-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee_preserve
; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GFX10-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve
+ ; GFX10-NEXT: [[GV1:%[0-9]+]]:sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
; GFX10-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll
index 11153bb..333207a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll
@@ -10,10 +10,10 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s8), align 8, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s8) from constant-pool, align 8, addrspace 4)
; HSA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8)
; HSA-VI-NEXT: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
@@ -25,10 +25,10 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s8), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s8) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8)
; LEGACY-MESA-VI-NEXT: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
@@ -45,10 +45,10 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s8), align 8, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s8) from constant-pool, align 8, addrspace 4)
; HSA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8)
; HSA-VI-NEXT: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
@@ -60,10 +60,10 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s8), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s8) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8)
; LEGACY-MESA-VI-NEXT: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
@@ -80,10 +80,10 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s8), align 8, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s8) from constant-pool, align 8, addrspace 4)
; HSA-VI-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s8)
; HSA-VI-NEXT: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
@@ -95,10 +95,10 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s8), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s8) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s8)
; LEGACY-MESA-VI-NEXT: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
@@ -115,10 +115,10 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s16), align 8, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s16) from constant-pool, align 8, addrspace 4)
; HSA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16)
; HSA-VI-NEXT: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
@@ -130,10 +130,10 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s16), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s16) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16)
; LEGACY-MESA-VI-NEXT: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
@@ -150,10 +150,10 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s16), align 8, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s16) from constant-pool, align 8, addrspace 4)
; HSA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16)
; HSA-VI-NEXT: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
@@ -165,10 +165,10 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s16), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s16) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16)
; LEGACY-MESA-VI-NEXT: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
@@ -185,10 +185,10 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s16), align 8, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s16) from constant-pool, align 8, addrspace 4)
; HSA-VI-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s16)
; HSA-VI-NEXT: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
@@ -200,10 +200,10 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s16), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s16) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s16)
; LEGACY-MESA-VI-NEXT: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
@@ -220,10 +220,10 @@ define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nou
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32), align 8, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32) from constant-pool, align 8, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
;
@@ -234,10 +234,10 @@ define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nou
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32), addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32) from constant-pool, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
entry:
@@ -253,10 +253,10 @@ define amdgpu_kernel void @f32_arg(ptr addrspace(1) nocapture %out, float %in) n
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32), align 8, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32) from constant-pool, align 8, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
;
@@ -267,10 +267,10 @@ define amdgpu_kernel void @f32_arg(ptr addrspace(1) nocapture %out, float %in) n
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32), addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32) from constant-pool, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
entry:
@@ -286,10 +286,10 @@ define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) {
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s8>), align 8, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s8>) from constant-pool, align 8, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<2 x s8>), [[LOAD]](p1) :: (store (<2 x s8>) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
;
@@ -300,10 +300,10 @@ define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) {
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s8>), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s8>) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](<2 x s8>), [[LOAD]](p1) :: (store (<2 x s8>) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
entry:
@@ -319,10 +319,10 @@ define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) {
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s16>), align 8, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s16>) from constant-pool, align 8, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<2 x s16>), [[LOAD]](p1) :: (store (<2 x s16>) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
;
@@ -333,10 +333,10 @@ define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) {
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s16>), addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s16>) from constant-pool, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](<2 x s16>), [[LOAD]](p1) :: (store (<2 x s16>) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
entry:
@@ -352,10 +352,10 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32>
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s32>), addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s32>) from constant-pool, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<2 x s32>), [[LOAD]](p1) :: (store (<2 x s32>) into %ir.out, align 4, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
;
@@ -366,10 +366,10 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32>
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s32>), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s32>) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](<2 x s32>), [[LOAD]](p1) :: (store (<2 x s32>) into %ir.out, align 4, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
entry:
@@ -385,10 +385,10 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s32>), addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s32>) from constant-pool, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<2 x s32>), [[LOAD]](p1) :: (store (<2 x s32>) into %ir.out, align 4, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
;
@@ -399,10 +399,10 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s32>), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s32>) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](<2 x s32>), [[LOAD]](p1) :: (store (<2 x s32>) into %ir.out, align 4, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
entry:
@@ -418,10 +418,10 @@ define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %i
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<3 x s8>), align 8, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<3 x s8>) from constant-pool, align 8, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<3 x s8>), [[LOAD]](p1) :: (store (<3 x s8>) into %ir.out, align 4, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
;
@@ -432,10 +432,10 @@ define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %i
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<3 x s8>), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<3 x s8>) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](<3 x s8>), [[LOAD]](p1) :: (store (<3 x s8>) into %ir.out, align 4, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
entry:
@@ -451,10 +451,10 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16>
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<3 x s16>), align 8, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<3 x s16>) from constant-pool, align 8, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<3 x s16>), [[LOAD]](p1) :: (store (<3 x s16>) into %ir.out, align 4, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
;
@@ -465,10 +465,10 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16>
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<3 x s16>), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<3 x s16>) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](<3 x s16>), [[LOAD]](p1) :: (store (<3 x s16>) into %ir.out, align 4, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
entry:
@@ -484,10 +484,10 @@ define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32>
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<3 x s32>), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<3 x s32>) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<3 x s32>), [[LOAD]](p1) :: (store (<3 x s32>) into %ir.out, align 4, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
;
@@ -498,10 +498,10 @@ define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32>
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 52
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<3 x s32>), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<3 x s32>) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](<3 x s32>), [[LOAD]](p1) :: (store (<3 x s32>) into %ir.out, align 4, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
entry:
@@ -517,10 +517,10 @@ define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<3 x s32>), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<3 x s32>) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<3 x s32>), [[LOAD]](p1) :: (store (<3 x s32>) into %ir.out, align 4, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
;
@@ -531,10 +531,10 @@ define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 52
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<3 x s32>), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<3 x s32>) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](<3 x s32>), [[LOAD]](p1) :: (store (<3 x s32>) into %ir.out, align 4, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
entry:
@@ -550,10 +550,10 @@ define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) {
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s8>), align 8, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s8>) from constant-pool, align 8, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<4 x s8>), [[LOAD]](p1) :: (store (<4 x s8>) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
;
@@ -564,10 +564,10 @@ define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) {
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s8>), addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s8>) from constant-pool, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](<4 x s8>), [[LOAD]](p1) :: (store (<4 x s8>) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
entry:
@@ -583,10 +583,10 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) {
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s16>), addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s16>) from constant-pool, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<4 x s16>), [[LOAD]](p1) :: (store (<4 x s16>) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
;
@@ -597,10 +597,10 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) {
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s16>), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s16>) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](<4 x s16>), [[LOAD]](p1) :: (store (<4 x s16>) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
entry:
@@ -616,10 +616,10 @@ define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32>
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s32>), addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s32>) from constant-pool, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<4 x s32>), [[LOAD]](p1) :: (store (<4 x s32>) into %ir.out, align 4, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
;
@@ -630,10 +630,10 @@ define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32>
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 52
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s32>), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s32>) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](<4 x s32>), [[LOAD]](p1) :: (store (<4 x s32>) into %ir.out, align 4, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
entry:
@@ -649,10 +649,10 @@ define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s32>), addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s32>) from constant-pool, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<4 x s32>), [[LOAD]](p1) :: (store (<4 x s32>) into %ir.out, align 4, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
;
@@ -663,10 +663,10 @@ define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 52
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s32>), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s32>) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](<4 x s32>), [[LOAD]](p1) :: (store (<4 x s32>) into %ir.out, align 4, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
entry:
@@ -682,10 +682,10 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<8 x s8>), addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<8 x s8>) from constant-pool, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<8 x s8>), [[LOAD]](p1) :: (store (<8 x s8>) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
;
@@ -696,10 +696,10 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<8 x s8>), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<8 x s8>) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](<8 x s8>), [[LOAD]](p1) :: (store (<8 x s8>) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
entry:
@@ -715,10 +715,10 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) {
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<8 x s16>), addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<8 x s16>) from constant-pool, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<8 x s16>), [[LOAD]](p1) :: (store (<8 x s16>) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
;
@@ -729,10 +729,10 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) {
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 52
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<8 x s16>), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<8 x s16>) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](<8 x s16>), [[LOAD]](p1) :: (store (<8 x s16>) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
entry:
@@ -748,10 +748,10 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<8 x s32>), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<8 x s32>) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<8 x s32>), [[LOAD]](p1) :: (store (<8 x s32>) into %ir.out, align 4, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
;
@@ -762,10 +762,10 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 68
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<8 x s32>), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<8 x s32>) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](<8 x s32>), [[LOAD]](p1) :: (store (<8 x s32>) into %ir.out, align 4, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
entry:
@@ -781,10 +781,10 @@ define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<8 x s32>), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<8 x s32>) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<8 x s32>), [[LOAD]](p1) :: (store (<8 x s32>) into %ir.out, align 4, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
;
@@ -795,10 +795,10 @@ define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 68
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<8 x s32>), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<8 x s32>) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](<8 x s32>), [[LOAD]](p1) :: (store (<8 x s32>) into %ir.out, align 4, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
entry:
@@ -814,10 +814,10 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s8>), addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s8>) from constant-pool, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<16 x s8>), [[LOAD]](p1) :: (store (<16 x s8>) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
;
@@ -828,10 +828,10 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 52
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s8>), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s8>) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](<16 x s8>), [[LOAD]](p1) :: (store (<16 x s8>) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
entry:
@@ -847,10 +847,10 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) {
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s16>), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s16>) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<16 x s16>), [[LOAD]](p1) :: (store (<16 x s16>) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
;
@@ -861,10 +861,10 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) {
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 68
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s16>), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s16>) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](<16 x s16>), [[LOAD]](p1) :: (store (<16 x s16>) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
entry:
@@ -880,10 +880,10 @@ define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s32>), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s32>) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<16 x s32>), [[LOAD]](p1) :: (store (<16 x s32>) into %ir.out, align 4, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
;
@@ -894,10 +894,10 @@ define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 100
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s32>), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s32>) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](<16 x s32>), [[LOAD]](p1) :: (store (<16 x s32>) into %ir.out, align 4, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
entry:
@@ -913,10 +913,10 @@ define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x flo
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s32>), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s32>) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<16 x s32>), [[LOAD]](p1) :: (store (<16 x s32>) into %ir.out, align 4, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
;
@@ -927,10 +927,10 @@ define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x flo
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 100
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s32>), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s32>) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](<16 x s32>), [[LOAD]](p1) :: (store (<16 x s32>) into %ir.out, align 4, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
entry:
@@ -946,10 +946,10 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s64), addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s64) from constant-pool, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](s64), [[LOAD]](p1) :: (store (s64) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
;
@@ -960,10 +960,10 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s64), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s64) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](s64), [[LOAD]](p1) :: (store (s64) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
store i64 %a, ptr addrspace(1) %out, align 8
@@ -978,10 +978,10 @@ define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) {
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s64), addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s64) from constant-pool, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](s64), [[LOAD]](p1) :: (store (s64) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
;
@@ -992,10 +992,10 @@ define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) {
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s64), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s64) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](s64), [[LOAD]](p1) :: (store (s64) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
entry:
@@ -1011,10 +1011,10 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind {
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s1), align 8, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s1) from constant-pool, align 8, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](s1), [[LOAD]](p1) :: (store (s1) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
;
@@ -1025,10 +1025,10 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind {
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](s1), [[LOAD]](p1) :: (store (s1) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
store i1 %x, ptr addrspace(1) %out, align 1
@@ -1043,10 +1043,10 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s1), align 8, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s1) from constant-pool, align 8, addrspace 4)
; HSA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s1)
; HSA-VI-NEXT: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
@@ -1058,10 +1058,10 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s1)
; LEGACY-MESA-VI-NEXT: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
@@ -1078,10 +1078,10 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s1), align 8, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s1) from constant-pool, align 8, addrspace 4)
; HSA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[LOAD1]](s1)
; HSA-VI-NEXT: G_STORE [[ZEXT]](s64), [[LOAD]](p1) :: (store (s64) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
@@ -1093,10 +1093,10 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[LOAD1]](s1)
; LEGACY-MESA-VI-NEXT: G_STORE [[ZEXT]](s64), [[LOAD]](p1) :: (store (s64) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
@@ -1113,10 +1113,10 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s1), align 8, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s1) from constant-pool, align 8, addrspace 4)
; HSA-VI-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s1)
; HSA-VI-NEXT: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
@@ -1128,10 +1128,10 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s1)
; LEGACY-MESA-VI-NEXT: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
@@ -1148,10 +1148,10 @@ define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwin
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s1), align 8, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s1) from constant-pool, align 8, addrspace 4)
; HSA-VI-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[LOAD1]](s1)
; HSA-VI-NEXT: G_STORE [[SEXT]](s64), [[LOAD]](p1) :: (store (s64) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
@@ -1163,10 +1163,10 @@ define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwin
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[LOAD1]](s1)
; LEGACY-MESA-VI-NEXT: G_STORE [[SEXT]](s64), [[LOAD]](p1) :: (store (s64) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
@@ -1185,7 +1185,7 @@ define amdgpu_kernel void @empty_struct_arg({} %arg0, i32 %arg1) nounwind {
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; HSA-VI-NEXT: G_STORE [[LOAD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
@@ -1197,7 +1197,7 @@ define amdgpu_kernel void @empty_struct_arg({} %arg0, i32 %arg1) nounwind {
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32) from constant-pool, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
@@ -1213,7 +1213,7 @@ define amdgpu_kernel void @empty_array_arg([0 x i8] %arg0, i32 %arg1) nounwind {
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; HSA-VI-NEXT: G_STORE [[LOAD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
@@ -1225,7 +1225,7 @@ define amdgpu_kernel void @empty_array_arg([0 x i8] %arg0, i32 %arg1) nounwind {
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32) from constant-pool, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
@@ -1249,19 +1249,19 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8 %pad,
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s64), addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s64) from constant-pool, addrspace 4)
; HSA-VI-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; HSA-VI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64)
- ; HSA-VI-NEXT: [[LOAD2:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s8), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD2:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s8) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
; HSA-VI-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C3]](s64)
- ; HSA-VI-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p4) :: (dereferenceable invariant load (s32), align 8, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p4) :: (dereferenceable invariant load (s32) from constant-pool, align 8, addrspace 4)
; HSA-VI-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
; HSA-VI-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C4]](s64)
- ; HSA-VI-NEXT: [[LOAD4:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD4]](p4) :: (dereferenceable invariant load (s64), addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD4:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD4]](p4) :: (dereferenceable invariant load (s64) from constant-pool, addrspace 4)
; HSA-VI-NEXT: [[C5:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
; HSA-VI-NEXT: G_STORE [[LOAD]](s32), [[C5]](p1) :: (volatile store (s32) into `ptr addrspace(1) null`, addrspace 1)
; HSA-VI-NEXT: G_STORE [[LOAD1]](s64), [[C5]](p1) :: (volatile store (s64) into `ptr addrspace(1) null`, addrspace 1)
@@ -1277,19 +1277,19 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8 %pad,
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32) from constant-pool, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s64), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s64) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 52
; LEGACY-MESA-VI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD2:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s8), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD2:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s8) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 60
; LEGACY-MESA-VI-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C3]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p4) :: (dereferenceable invariant load (s32), addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p4) :: (dereferenceable invariant load (s32) from constant-pool, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 68
; LEGACY-MESA-VI-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C4]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD4:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD4]](p4) :: (dereferenceable invariant load (s64), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD4:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD4]](p4) :: (dereferenceable invariant load (s64) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C5:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD]](s32), [[C5]](p1) :: (volatile store (s32) into `ptr addrspace(1) null`, addrspace 1)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](s64), [[C5]](p1) :: (volatile store (s64) into `ptr addrspace(1) null`, addrspace 1)
@@ -1317,19 +1317,19 @@ define amdgpu_kernel void @pointer_in_struct_argument({ptr addrspace(3), ptr add
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s64), addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s64) from constant-pool, addrspace 4)
; HSA-VI-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; HSA-VI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64)
- ; HSA-VI-NEXT: [[LOAD2:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s8), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD2:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s8) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
; HSA-VI-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C3]](s64)
- ; HSA-VI-NEXT: [[LOAD3:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD3]](p4) :: (dereferenceable invariant load (s32), align 8, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD3:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD3]](p4) :: (dereferenceable invariant load (s32) from constant-pool, align 8, addrspace 4)
; HSA-VI-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
; HSA-VI-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C4]](s64)
- ; HSA-VI-NEXT: [[LOAD4:%[0-9]+]]:_(p1234) = G_LOAD [[PTR_ADD4]](p4) :: (dereferenceable invariant load (s64), addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD4:%[0-9]+]]:_(p1234) = G_LOAD [[PTR_ADD4]](p4) :: (dereferenceable invariant load (s64) from constant-pool, addrspace 4)
; HSA-VI-NEXT: [[C5:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
; HSA-VI-NEXT: G_STORE [[LOAD]](p3), [[C5]](p1) :: (volatile store (p3) into `ptr addrspace(1) null`, addrspace 1)
; HSA-VI-NEXT: G_STORE [[LOAD1]](p1), [[C5]](p1) :: (volatile store (p1) into `ptr addrspace(1) null`, addrspace 1)
@@ -1345,19 +1345,19 @@ define amdgpu_kernel void @pointer_in_struct_argument({ptr addrspace(3), ptr add
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32) from constant-pool, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s64), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s64) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 52
; LEGACY-MESA-VI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD2:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s8), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD2:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s8) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 60
; LEGACY-MESA-VI-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C3]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD3:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD3]](p4) :: (dereferenceable invariant load (s32), addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD3:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD3]](p4) :: (dereferenceable invariant load (s32) from constant-pool, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 68
; LEGACY-MESA-VI-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C4]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD4:%[0-9]+]]:_(p1234) = G_LOAD [[PTR_ADD4]](p4) :: (dereferenceable invariant load (s64), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD4:%[0-9]+]]:_(p1234) = G_LOAD [[PTR_ADD4]](p4) :: (dereferenceable invariant load (s64) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C5:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD]](p3), [[C5]](p1) :: (volatile store (p3) into `ptr addrspace(1) null`, addrspace 1)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](p1), [[C5]](p1) :: (volatile store (p1) into `ptr addrspace(1) null`, addrspace 1)
@@ -1387,16 +1387,16 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s64), align 4, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s64) from constant-pool, align 4, addrspace 4)
; HSA-VI-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 13
; HSA-VI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64)
- ; HSA-VI-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s32), align 1, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s32) from constant-pool, align 1, addrspace 4)
; HSA-VI-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 17
; HSA-VI-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C3]](s64)
- ; HSA-VI-NEXT: [[LOAD3:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD3]](p4) :: (dereferenceable invariant load (s64), align 1, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD3:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD3]](p4) :: (dereferenceable invariant load (s64) from constant-pool, align 1, addrspace 4)
; HSA-VI-NEXT: [[C4:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
; HSA-VI-NEXT: G_STORE [[LOAD]](s32), [[C4]](p1) :: (volatile store (s32) into `ptr addrspace(1) null`, addrspace 1)
; HSA-VI-NEXT: G_STORE [[LOAD1]](s64), [[C4]](p1) :: (volatile store (s64) into `ptr addrspace(1) null`, addrspace 1)
@@ -1411,16 +1411,16 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32) from constant-pool, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 40
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s64), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s64) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 49
; LEGACY-MESA-VI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s32), align 1, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s32) from constant-pool, align 1, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 53
; LEGACY-MESA-VI-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C3]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD3:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD3]](p4) :: (dereferenceable invariant load (s64), align 1, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD3:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD3]](p4) :: (dereferenceable invariant load (s64) from constant-pool, align 1, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C4:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD]](s32), [[C4]](p1) :: (volatile store (s32) into `ptr addrspace(1) null`, addrspace 1)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](s64), [[C4]](p1) :: (volatile store (s64) into `ptr addrspace(1) null`, addrspace 1)
@@ -1465,7 +1465,7 @@ define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s8) from %ir.in.byref, addrspace 4)
@@ -1480,7 +1480,7 @@ define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s8) from %ir.in.byref, addrspace 4)
@@ -1501,7 +1501,7 @@ define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %ou
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s16) from %ir.in.byref, addrspace 4)
@@ -1516,7 +1516,7 @@ define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %ou
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s16) from %ir.in.byref, addrspace 4)
@@ -1537,12 +1537,12 @@ define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %ou
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
; HSA-VI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s32), addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s32) from constant-pool, addrspace 4)
; HSA-VI-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32) from %ir.in.byref, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
@@ -1555,12 +1555,12 @@ define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %ou
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; LEGACY-MESA-VI-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 48
; LEGACY-MESA-VI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s32) from constant-pool, align 16, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32) from %ir.in.byref, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
@@ -1579,12 +1579,12 @@ define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture %
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
; HSA-VI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s32) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[LOAD2:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s32>) from %ir.in.byref, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD2]](<4 x s32>), [[LOAD]](p1) :: (volatile store (<4 x s32>) into %ir.out, align 4, addrspace 1)
; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
@@ -1597,12 +1597,12 @@ define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture %
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 52
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; LEGACY-MESA-VI-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 68
; LEGACY-MESA-VI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s32), addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s32) from constant-pool, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[LOAD2:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s32>) from %ir.in.byref, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD2]](<4 x s32>), [[LOAD]](p1) :: (volatile store (<4 x s32>) into %ir.out, align 4, addrspace 1)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
@@ -1621,12 +1621,12 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 256
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 260
; HSA-VI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s32), addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s32) from constant-pool, addrspace 4)
; HSA-VI-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32) from %ir.in.byref, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
@@ -1639,12 +1639,12 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 292
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; LEGACY-MESA-VI-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 296
; LEGACY-MESA-VI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s32), align 8, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s32) from constant-pool, align 8, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32) from %ir.in.byref, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
@@ -1663,12 +1663,12 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 128
; HSA-VI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s32) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[LOAD2:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s32>) from %ir.in.byref, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD2]](<16 x s32>), [[LOAD]](p1) :: (volatile store (<16 x s32>) into %ir.out, align 4, addrspace 1)
; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
@@ -1681,12 +1681,12 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 100
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; LEGACY-MESA-VI-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 164
; LEGACY-MESA-VI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s32), addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s32) from constant-pool, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[LOAD2:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s32>) from %ir.in.byref, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD2]](<16 x s32>), [[LOAD]](p1) :: (volatile store (<16 x s32>) into %ir.out, align 4, addrspace 1)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
@@ -1706,7 +1706,7 @@ define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out,
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI-NEXT: [[ADDRSPACE_CAST:%[0-9]+]]:_(p1) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4)
@@ -1721,7 +1721,7 @@ define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out,
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; LEGACY-MESA-VI-NEXT: [[ADDRSPACE_CAST:%[0-9]+]]:_(p1) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4)
@@ -1741,7 +1741,7 @@ define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, p
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI-NEXT: [[ADDRSPACE_CAST:%[0-9]+]]:_(p0) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4)
@@ -1756,7 +1756,7 @@ define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, p
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; LEGACY-MESA-VI-NEXT: [[ADDRSPACE_CAST:%[0-9]+]]:_(p0) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4)
@@ -1776,7 +1776,7 @@ define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocaptu
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI-NEXT: [[ADDRSPACE_CAST:%[0-9]+]]:_(p6) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4)
@@ -1791,7 +1791,7 @@ define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocaptu
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; LEGACY-MESA-VI-NEXT: [[ADDRSPACE_CAST:%[0-9]+]]:_(p6) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4)
@@ -1811,7 +1811,7 @@ define amdgpu_kernel void @byref_unknown_as_i32_arg(ptr addrspace(1) nocapture %
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI-NEXT: [[ADDRSPACE_CAST:%[0-9]+]]:_(p999) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4)
@@ -1826,7 +1826,7 @@ define amdgpu_kernel void @byref_unknown_as_i32_arg(ptr addrspace(1) nocapture %
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; LEGACY-MESA-VI-NEXT: [[ADDRSPACE_CAST:%[0-9]+]]:_(p999) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4)
@@ -1847,7 +1847,7 @@ define amdgpu_kernel void @byref_local_i32_arg(ptr addrspace(1) nocapture %out,
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI-NEXT: [[ADDRSPACE_CAST:%[0-9]+]]:_(p3) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4)
@@ -1862,7 +1862,7 @@ define amdgpu_kernel void @byref_local_i32_arg(ptr addrspace(1) nocapture %out,
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; LEGACY-MESA-VI-NEXT: [[ADDRSPACE_CAST:%[0-9]+]]:_(p3) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4)
@@ -1882,14 +1882,14 @@ define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocaptu
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
; HSA-VI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64)
; HSA-VI-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; HSA-VI-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C3]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p4) :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p4) :: (dereferenceable invariant load (s32) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32) from %ir.in0.byref, addrspace 4)
; HSA-VI-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s32) from %ir.in1.byref, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
@@ -1904,14 +1904,14 @@ define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocaptu
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; LEGACY-MESA-VI-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 48
; LEGACY-MESA-VI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64)
; LEGACY-MESA-VI-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 52
; LEGACY-MESA-VI-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C3]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p4) :: (dereferenceable invariant load (s32), addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p4) :: (dereferenceable invariant load (s32) from constant-pool, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32) from %ir.in0.byref, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s32) from %ir.in1.byref, addrspace 4)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
@@ -1963,7 +1963,7 @@ define amdgpu_kernel void @p3i8_arg(ptr addrspace(3) %arg) nounwind {
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p3), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p3) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s8) = G_CONSTANT i8 9
; HSA-VI-NEXT: G_STORE [[C1]](s8), [[LOAD]](p3) :: (store (s8) into %ir.arg, align 4, addrspace 3)
; HSA-VI-NEXT: S_ENDPGM 0
@@ -1975,7 +1975,7 @@ define amdgpu_kernel void @p3i8_arg(ptr addrspace(3) %arg) nounwind {
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p3), addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p3) from constant-pool, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s8) = G_CONSTANT i8 9
; LEGACY-MESA-VI-NEXT: G_STORE [[C1]](s8), [[LOAD]](p3) :: (store (s8) into %ir.arg, align 4, addrspace 3)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
@@ -2015,7 +2015,7 @@ define amdgpu_kernel void @v2p1i8_arg(<2 x ptr addrspace(1)> %arg) nounwind {
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x p1>), addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x p1>) from constant-pool, addrspace 4)
; HSA-VI-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; HSA-VI-NEXT: G_STORE [[LOAD]](<2 x p1>), [[DEF]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) poison`, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
@@ -2027,7 +2027,7 @@ define amdgpu_kernel void @v2p1i8_arg(<2 x ptr addrspace(1)> %arg) nounwind {
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x p1>), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x p1>) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD]](<2 x p1>), [[DEF]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) poison`, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
@@ -2043,7 +2043,7 @@ define amdgpu_kernel void @v2p3i8_arg(<2 x ptr addrspace(3)> %arg) nounwind {
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x p3>), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x p3>) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; HSA-VI-NEXT: G_STORE [[LOAD]](<2 x p3>), [[DEF]](p1) :: (store (<2 x p3>) into `ptr addrspace(1) poison`, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
@@ -2055,7 +2055,7 @@ define amdgpu_kernel void @v2p3i8_arg(<2 x ptr addrspace(3)> %arg) nounwind {
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x p3>), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x p3>) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD]](<2 x p3>), [[DEF]](p1) :: (store (<2 x p3>) into `ptr addrspace(1) poison`, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
@@ -2071,10 +2071,10 @@ define amdgpu_kernel void @v2p1i8_in_struct_arg({ <2 x ptr addrspace(1)>, <2 x p
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x s64>), addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x s64>) from constant-pool, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s32>), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s32>) from constant-pool, align 16, addrspace 4)
; HSA-VI-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; HSA-VI-NEXT: G_STORE [[LOAD]](<2 x p1>), [[DEF]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) poison`, addrspace 1)
; HSA-VI-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
@@ -2089,10 +2089,10 @@ define amdgpu_kernel void @v2p1i8_in_struct_arg({ <2 x ptr addrspace(1)>, <2 x p
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x s64>) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 52
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s32>), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s32>) from constant-pool, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD]](<2 x p1>), [[DEF]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) poison`, addrspace 1)
; LEGACY-MESA-VI-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index 4e70c15..c935310 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -3850,8 +3850,9 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_p3_p5() #0 {
; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) poison`, addrspace 4)
; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<32 x s32>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<32 x s32>) from %ir.ptr0, addrspace 1)
- ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(p3) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (p3) from `ptr addrspace(1) poison`, addrspace 1)
- ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(p5) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (p5) from `ptr addrspace(1) poison`, addrspace 1)
+ ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (s32) from `ptr addrspace(1) poison`, addrspace 1)
+ ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[LOAD2]](s32)
+ ; CHECK-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p5) = G_INTTOPTR [[LOAD2]](s32)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32_p3_p5
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -3880,10 +3881,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_p3_p5() #0 {
; CHECK-NEXT: G_STORE [[UV31]](s32), [[PTR_ADD1]](p5) :: (store (s32) into stack, align 16, addrspace 5)
; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C4]](s32)
- ; CHECK-NEXT: G_STORE [[LOAD2]](p3), [[PTR_ADD2]](p5) :: (store (p3) into stack + 4, addrspace 5)
+ ; CHECK-NEXT: G_STORE [[INTTOPTR]](p3), [[PTR_ADD2]](p5) :: (store (p3) into stack + 4, addrspace 5)
; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C5]](s32)
- ; CHECK-NEXT: G_STORE [[LOAD3]](p5), [[PTR_ADD3]](p5) :: (store (p5) into stack + 8, align 8, addrspace 5)
+ ; CHECK-NEXT: G_STORE [[INTTOPTR1]](p5), [[PTR_ADD3]](p5) :: (store (p5) into stack + 8, align 8, addrspace 5)
; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32)
; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll
index e5cd071..b290c31 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll
@@ -66,7 +66,7 @@ define amdgpu_kernel void @asm_simple_agpr_clobber() {
define i32 @asm_vgpr_early_clobber() {
; CHECK-LABEL: name: asm_vgpr_early_clobber
; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %8, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %9, !1
+ ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 1245195 /* regdef-ec:VGPR_32 */, def early-clobber %8, 1245195 /* regdef-ec:VGPR_32 */, def early-clobber %9, !1
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9
; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]]
@@ -94,7 +94,7 @@ entry:
define i32 @test_single_vgpr_output() nounwind {
; CHECK-LABEL: name: test_single_vgpr_output
; CHECK: bb.1.entry:
- ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8
+ ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %8
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8
; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
@@ -106,7 +106,7 @@ entry:
define i32 @test_single_sgpr_output_s32() nounwind {
; CHECK-LABEL: name: test_single_sgpr_output_s32
; CHECK: bb.1.entry:
- ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %8
+ ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:SReg_32 */, def %8
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8
; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
@@ -119,7 +119,7 @@ entry:
define float @test_multiple_register_outputs_same() #0 {
; CHECK-LABEL: name: test_multiple_register_outputs_same
; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8, 1835018 /* regdef:VGPR_32 */, def %9
+ ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %8, 1245194 /* regdef:VGPR_32 */, def %9
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9
; CHECK-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY]], [[COPY1]]
@@ -136,7 +136,7 @@ define float @test_multiple_register_outputs_same() #0 {
define double @test_multiple_register_outputs_mixed() #0 {
; CHECK-LABEL: name: test_multiple_register_outputs_mixed
; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8, 3407882 /* regdef:VReg_64 */, def %9
+ ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %8, 2818058 /* regdef:VReg_64 */, def %9
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY %9
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
@@ -171,7 +171,7 @@ define amdgpu_kernel void @test_input_vgpr_imm() {
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[C]](s32)
- ; CHECK-NEXT: INLINEASM &"v_mov_b32 v0, $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"v_mov_b32 v0, $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY1]]
; CHECK-NEXT: S_ENDPGM 0
call void asm sideeffect "v_mov_b32 v0, $0", "v"(i32 42)
ret void
@@ -185,7 +185,7 @@ define amdgpu_kernel void @test_input_sgpr_imm() {
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[C]](s32)
- ; CHECK-NEXT: INLINEASM &"s_mov_b32 s0, $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"s_mov_b32 s0, $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[COPY1]]
; CHECK-NEXT: S_ENDPGM 0
call void asm sideeffect "s_mov_b32 s0, $0", "s"(i32 42)
ret void
@@ -212,7 +212,7 @@ define float @test_input_vgpr(i32 %src) nounwind {
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32)
- ; CHECK-NEXT: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %9, 1835017 /* reguse:VGPR_32 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %9, 1245193 /* reguse:VGPR_32 */, [[COPY1]]
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %9
; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
@@ -227,7 +227,7 @@ define i32 @test_memory_constraint(ptr addrspace(3) %a) nounwind {
; CHECK-NEXT: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
- ; CHECK-NEXT: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 1835018 /* regdef:VGPR_32 */, def %9, 262158 /* mem:m */, [[COPY]](p3)
+ ; CHECK-NEXT: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 1245194 /* regdef:VGPR_32 */, def %9, 262158 /* mem:m */, [[COPY]](p3)
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9
; CHECK-NEXT: $vgpr0 = COPY [[COPY1]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
@@ -244,7 +244,7 @@ define i32 @test_vgpr_matching_constraint(i32 %a) nounwind {
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[AND]](s32)
- ; CHECK-NEXT: INLINEASM &";", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %11, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3)
+ ; CHECK-NEXT: INLINEASM &";", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %11, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3)
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %11
; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
@@ -256,13 +256,13 @@ define i32 @test_vgpr_matching_constraint(i32 %a) nounwind {
define i32 @test_sgpr_matching_constraint() nounwind {
; CHECK-LABEL: name: test_sgpr_matching_constraint
; CHECK: bb.1.entry:
- ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %8
+ ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:SReg_32 */, def %8
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8
- ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %10
+ ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 1835018 /* regdef:SReg_32 */, def %10
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %10
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]](s32)
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY1]](s32)
- ; CHECK-NEXT: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %12, 2424841 /* reguse:SReg_32 */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3)
+ ; CHECK-NEXT: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 1835018 /* regdef:SReg_32 */, def %12, 1835017 /* reguse:SReg_32 */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3)
; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY %12
; CHECK-NEXT: $vgpr0 = COPY [[COPY4]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
@@ -285,7 +285,7 @@ define void @test_many_matching_constraints(i32 %a, i32 %b, i32 %c) nounwind {
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]](s32)
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32)
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY1]](s32)
- ; CHECK-NEXT: INLINEASM &"; ", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %11, 1835018 /* regdef:VGPR_32 */, def %12, 1835018 /* regdef:VGPR_32 */, def %13, 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY4]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY5]](tied-def 5)
+ ; CHECK-NEXT: INLINEASM &"; ", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %11, 1245194 /* regdef:VGPR_32 */, def %12, 1245194 /* regdef:VGPR_32 */, def %13, 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY4]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY5]](tied-def 5)
; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY %11
; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY %12
; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY %13
@@ -306,10 +306,10 @@ define void @test_many_matching_constraints(i32 %a, i32 %b, i32 %c) nounwind {
define i32 @test_sgpr_to_vgpr_move_matching_constraint() nounwind {
; CHECK-LABEL: name: test_sgpr_to_vgpr_move_matching_constraint
; CHECK: bb.1.entry:
- ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %8
+ ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:SReg_32 */, def %8
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32)
- ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %10, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3)
+ ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %10, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3)
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %10
; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
@@ -331,6 +331,100 @@ define amdgpu_kernel void @asm_constraint_n_n() {
ret void
}
+define void @test_indirectify_i32_value(i32 %x, i32 %y) {
+ ; CHECK-LABEL: name: test_indirectify_i32_value
+ ; CHECK: bb.1.entry:
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0
+ ; CHECK-NEXT: G_STORE [[COPY]](s32), [[FRAME_INDEX]](p5) :: (store (s32) into %stack.0, addrspace 5)
+ ; CHECK-NEXT: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.1
+ ; CHECK-NEXT: G_STORE [[COPY1]](s32), [[FRAME_INDEX1]](p5) :: (store (s32) into %stack.1, addrspace 5)
+ ; CHECK-NEXT: INLINEASM &"", 25 /* sideeffect mayload maystore attdialect */, 262158 /* mem:m */, [[FRAME_INDEX]](p5), 262158 /* mem:m */, [[FRAME_INDEX1]](p5)
+ ; CHECK-NEXT: SI_RETURN
+entry:
+ tail call void asm sideeffect "", "imr,imr,~{memory}"(i32 %x, i32 %y)
+ ret void
+}
+
+define void @test_indirectify_i32_constant() {
+ ; CHECK-LABEL: name: test_indirectify_i32_constant
+ ; CHECK: bb.1.entry:
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0
+ ; CHECK-NEXT: G_STORE [[C]](s32), [[FRAME_INDEX]](p5) :: (store (s32) into %stack.0, addrspace 5)
+ ; CHECK-NEXT: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.1
+ ; CHECK-NEXT: G_STORE [[C1]](s32), [[FRAME_INDEX1]](p5) :: (store (s32) into %stack.1, addrspace 5)
+ ; CHECK-NEXT: INLINEASM &"", 25 /* sideeffect mayload maystore attdialect */, 262158 /* mem:m */, [[FRAME_INDEX]](p5), 262158 /* mem:m */, [[FRAME_INDEX1]](p5)
+ ; CHECK-NEXT: SI_RETURN
+entry:
+ tail call void asm sideeffect "", "imr,imr,~{memory}"(i32 42, i32 0)
+ ret void
+}
+
+define void @test_indirectify_i16_value(i16 %val) {
+ ; CHECK-LABEL: name: test_indirectify_i16_value
+ ; CHECK: bb.1.entry:
+ ; CHECK-NEXT: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0
+ ; CHECK-NEXT: G_STORE [[TRUNC]](s16), [[FRAME_INDEX]](p5) :: (store (s16) into %stack.0, addrspace 5)
+ ; CHECK-NEXT: INLINEASM &"", 25 /* sideeffect mayload maystore attdialect */, 262158 /* mem:m */, [[FRAME_INDEX]](p5)
+ ; CHECK-NEXT: SI_RETURN
+entry:
+ tail call void asm sideeffect "", "imr,~{memory}"(i16 %val)
+ ret void
+}
+
+define void @test_indirectify_i16_constant() {
+ ; CHECK-LABEL: name: test_indirectify_i16_constant
+ ; CHECK: bb.1.entry:
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 42
+ ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0
+ ; CHECK-NEXT: G_STORE [[C]](s16), [[FRAME_INDEX]](p5) :: (store (s16) into %stack.0, addrspace 5)
+ ; CHECK-NEXT: INLINEASM &"", 25 /* sideeffect mayload maystore attdialect */, 262158 /* mem:m */, [[FRAME_INDEX]](p5)
+ ; CHECK-NEXT: SI_RETURN
+entry:
+ tail call void asm sideeffect "", "imr,~{memory}"(i16 42)
+ ret void
+}
+
+define void @test_indirectify_i64_value(i64 %val) {
+ ; CHECK-LABEL: name: test_indirectify_i64_value
+ ; CHECK: bb.1.entry:
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0
+ ; CHECK-NEXT: G_STORE [[MV]](s64), [[FRAME_INDEX]](p5) :: (store (s64) into %stack.0, addrspace 5)
+ ; CHECK-NEXT: INLINEASM &"", 25 /* sideeffect mayload maystore attdialect */, 262158 /* mem:m */, [[FRAME_INDEX]](p5)
+ ; CHECK-NEXT: SI_RETURN
+entry:
+ tail call void asm sideeffect "", "imr,~{memory}"(i64 %val)
+ ret void
+}
+
+define void @test_indirectify_i64_constant() {
+ ; CHECK-LABEL: name: test_indirectify_i64_constant
+ ; CHECK: bb.1.entry:
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 42
+ ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0
+ ; CHECK-NEXT: G_STORE [[C]](s64), [[FRAME_INDEX]](p5) :: (store (s64) into %stack.0, addrspace 5)
+ ; CHECK-NEXT: INLINEASM &"", 25 /* sideeffect mayload maystore attdialect */, 262158 /* mem:m */, [[FRAME_INDEX]](p5)
+ ; CHECK-NEXT: SI_RETURN
+entry:
+ tail call void asm sideeffect "", "imr,~{memory}"(i64 42)
+ ret void
+}
+
!llvm.module.flags = !{!1}
!0 = !{i32 70}
!1 = !{i32 1, !"amdhsa_code_object_version", i32 500}
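The test_indirectify_* functions added above all exercise the same lowering: when an "imr" inline-asm operand ends up neither as an immediate nor in a register, GlobalISel indirectifies it, storing the value (or constant) to a fresh private stack slot and handing the G_FRAME_INDEX pointer to the INLINEASM as a "mem:m" operand. At the IR level the effect is roughly the rewrite below; this is a sketch for illustration, not code from the patch, and the names are invented:

define void @indirectify_sketch(i32 %x) {
entry:
  ; "imr,~{memory}"(i32 %x) is effectively demoted to:
  %slot = alloca i32, addrspace(5)        ; fresh private (addrspace 5) stack slot
  store i32 %x, ptr addrspace(5) %slot    ; spill the operand value
  call void asm sideeffect "", "*m,~{memory}"(ptr addrspace(5) elementtype(i32) %slot)
  ret void
}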
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll
index 82886ab..e1ac8ba 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll
@@ -1,4 +1,4 @@
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -amdgpu-enable-lower-module-lds=0 -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -amdgpu-enable-lower-module-lds=0 -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s
; FIXME: Merge with DAG test
@lds.external = external unnamed_addr addrspace(3) global [0 x i32]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll
index cabb37c..3396eae 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -global-isel -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -global-isel -new-reg-bank-select -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel -new-reg-bank-select -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: not llc -mtriple=amdgcn -mcpu=tahiti -global-isel < %s 2>&1 | FileCheck %s
-; RUN: not llc -mtriple=amdgcn -mcpu=tonga -global-isel < %s 2>&1 | FileCheck %s
+; RUN: not llc -mtriple=amdgcn -mcpu=tahiti -global-isel -new-reg-bank-select < %s 2>&1 | FileCheck %s
+; RUN: not llc -mtriple=amdgcn -mcpu=tonga -global-isel -new-reg-bank-select < %s 2>&1 | FileCheck %s
; CHECK: error: lds: unsupported initializer for address space
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir
index 4471980..e83b4ea 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir
@@ -428,9 +428,9 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:_(p6) = COPY $vgpr0
; GCN-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p6)
- ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GCN-NEXT: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[PTRTOINT]](s32), [[C]](s32)
- ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p4)
+ ; GCN-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[PTRTOINT]](s32)
+ ; GCN-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[ZEXT]](s64)
+ ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4)
%0:_(p6) = COPY $vgpr0
%1:_(p4) = G_ADDRSPACE_CAST %0
$vgpr0_vgpr1 = COPY %1
@@ -485,9 +485,9 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:_(p6) = COPY $vgpr0
; GCN-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p6)
- ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GCN-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[C]](s32)
- ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p0)
+ ; GCN-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[PTRTOINT]](s32)
+ ; GCN-NEXT: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[ZEXT]](s64)
+ ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p0)
%0:_(p6) = COPY $vgpr0
%1:_(p0) = G_ADDRSPACE_CAST %0
$vgpr0_vgpr1 = COPY %1
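The new legalization shape checked here (and again in the constant-32bit load tests that follow) widens a 32-bit constant-address pointer in one step: a G_ZEXT plus G_INTTOPTR pair replaces the old G_MERGE_VALUES with an explicit zero high half. Both forms zero-fill the upper 32 bits; distilled from the checks above, the pattern is:

    %0:_(p6) = COPY $vgpr0
    %1:_(s32) = G_PTRTOINT %0(p6)
    %2:_(s64) = G_ZEXT %1(s32)        ; was: G_MERGE_VALUES %1(s32), 0
    %3:_(p4) = G_INTTOPTR %2(s64)
    $vgpr0_vgpr1 = COPY %3(p4)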
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant-32bit.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant-32bit.mir
index b91f1f4..b9c0217 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant-32bit.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant-32bit.mir
@@ -12,24 +12,24 @@ body: |
; CI-NEXT: {{ $}}
; CI-NEXT: [[COPY:%[0-9]+]]:_(p6) = COPY $vgpr0
; CI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p6)
- ; CI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CI-NEXT: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[PTRTOINT]](s32), [[C]](s32)
- ; CI-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[MV]](p4) :: (load (s8), addrspace 6)
- ; CI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
- ; CI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = nuw inbounds G_PTR_ADD [[MV]], [[C1]](s64)
+ ; CI-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[PTRTOINT]](s32)
+ ; CI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[ZEXT]](s64)
+ ; CI-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[INTTOPTR]](p4) :: (load (s8), addrspace 6)
+ ; CI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; CI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = nuw inbounds G_PTR_ADD [[INTTOPTR]], [[C]](s64)
; CI-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p4) :: (load (s8) from unknown-address + 1, addrspace 6)
- ; CI-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
- ; CI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C2]](s32)
+ ; CI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+ ; CI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
; CI-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
- ; CI-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
- ; CI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = nuw inbounds G_PTR_ADD [[MV]], [[C3]](s64)
+ ; CI-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+ ; CI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = nuw inbounds G_PTR_ADD [[INTTOPTR]], [[C2]](s64)
; CI-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p4) :: (load (s8) from unknown-address + 2, addrspace 6)
- ; CI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p4) = nuw inbounds G_PTR_ADD [[PTR_ADD1]], [[C1]](s64)
+ ; CI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p4) = nuw inbounds G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
; CI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (load (s8) from unknown-address + 3, addrspace 6)
- ; CI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C2]](s32)
+ ; CI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
; CI-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
- ; CI-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; CI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C4]](s32)
+ ; CI-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; CI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
; CI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
; CI-NEXT: $vgpr0 = COPY [[OR2]](s32)
%0:_(p6) = COPY $vgpr0
@@ -48,9 +48,9 @@ body: |
; CI-NEXT: {{ $}}
; CI-NEXT: [[COPY:%[0-9]+]]:_(p6) = COPY $vgpr0
; CI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p6)
- ; CI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CI-NEXT: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[PTRTOINT]](s32), [[C]](s32)
- ; CI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load (s32), addrspace 6)
+ ; CI-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[PTRTOINT]](s32)
+ ; CI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[ZEXT]](s64)
+ ; CI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[INTTOPTR]](p4) :: (load (s32), addrspace 6)
; CI-NEXT: $vgpr0 = COPY [[LOAD]](s32)
%0:_(p6) = COPY $vgpr0
%1:_(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 6)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-constant-32bit.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-constant-32bit.mir
index d87212d..067844d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-constant-32bit.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-constant-32bit.mir
@@ -13,9 +13,9 @@ body: |
; CI-NEXT: {{ $}}
; CI-NEXT: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0
; CI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p6)
- ; CI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CI-NEXT: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[PTRTOINT]](s32), [[C]](s32)
- ; CI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load (s32), addrspace 6)
+ ; CI-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[PTRTOINT]](s32)
+ ; CI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[ZEXT]](s64)
+ ; CI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[INTTOPTR]](p4) :: (load (s32), addrspace 6)
; CI-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[LOAD]](s32)
; CI-NEXT: $vgpr0_vgpr1 = COPY [[SEXT]](s64)
%0:_(p6) = COPY $sgpr0
@@ -34,9 +34,9 @@ body: |
; CI-NEXT: {{ $}}
; CI-NEXT: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0
; CI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p6)
- ; CI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CI-NEXT: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[PTRTOINT]](s32), [[C]](s32)
- ; CI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load (s32), align 2, addrspace 6)
+ ; CI-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[PTRTOINT]](s32)
+ ; CI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[ZEXT]](s64)
+ ; CI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[INTTOPTR]](p4) :: (load (s32), align 2, addrspace 6)
; CI-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[LOAD]](s32)
; CI-NEXT: $vgpr0_vgpr1 = COPY [[SEXT]](s64)
%0:_(p6) = COPY $sgpr0
@@ -55,9 +55,9 @@ body: |
; CI-NEXT: {{ $}}
; CI-NEXT: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0
; CI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p6)
- ; CI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CI-NEXT: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[PTRTOINT]](s32), [[C]](s32)
- ; CI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load (s32), align 1, addrspace 6)
+ ; CI-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[PTRTOINT]](s32)
+ ; CI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[ZEXT]](s64)
+ ; CI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[INTTOPTR]](p4) :: (load (s32), align 1, addrspace 6)
; CI-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[LOAD]](s32)
; CI-NEXT: $vgpr0_vgpr1 = COPY [[SEXT]](s64)
%0:_(p6) = COPY $sgpr0
@@ -76,9 +76,9 @@ body: |
; CI-NEXT: {{ $}}
; CI-NEXT: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0
; CI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p6)
- ; CI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CI-NEXT: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[PTRTOINT]](s32), [[C]](s32)
- ; CI-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[MV]](p4) :: (load (s8), addrspace 6)
+ ; CI-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[PTRTOINT]](s32)
+ ; CI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[ZEXT]](s64)
+ ; CI-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[INTTOPTR]](p4) :: (load (s8), addrspace 6)
; CI-NEXT: $vgpr0 = COPY [[SEXTLOAD]](s32)
%0:_(p6) = COPY $sgpr0
%1:_(s32) = G_SEXTLOAD %0 :: (load (s8), align 1, addrspace 6)
@@ -96,9 +96,9 @@ body: |
; CI-NEXT: {{ $}}
; CI-NEXT: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0
; CI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p6)
- ; CI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CI-NEXT: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[PTRTOINT]](s32), [[C]](s32)
- ; CI-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[MV]](p4) :: (load (s16), addrspace 6)
+ ; CI-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[PTRTOINT]](s32)
+ ; CI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[ZEXT]](s64)
+ ; CI-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[INTTOPTR]](p4) :: (load (s16), addrspace 6)
; CI-NEXT: $vgpr0 = COPY [[SEXTLOAD]](s32)
%0:_(p6) = COPY $sgpr0
%1:_(s32) = G_SEXTLOAD %0 :: (load (s16), align 2, addrspace 6)
@@ -116,9 +116,9 @@ body: |
; CI-NEXT: {{ $}}
; CI-NEXT: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0
; CI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p6)
- ; CI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CI-NEXT: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[PTRTOINT]](s32), [[C]](s32)
- ; CI-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[MV]](p4) :: (load (s16), align 1, addrspace 6)
+ ; CI-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[PTRTOINT]](s32)
+ ; CI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[ZEXT]](s64)
+ ; CI-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[INTTOPTR]](p4) :: (load (s16), align 1, addrspace 6)
; CI-NEXT: $vgpr0 = COPY [[SEXTLOAD]](s32)
%0:_(p6) = COPY $sgpr0
%1:_(s32) = G_SEXTLOAD %0 :: (load (s16), align 1, addrspace 6)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-zextload-constant-32bit.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-zextload-constant-32bit.mir
index a4971e94..c72cdd5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-zextload-constant-32bit.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-zextload-constant-32bit.mir
@@ -14,11 +14,11 @@ body: |
; CI-NEXT: {{ $}}
; CI-NEXT: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0
; CI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p6)
- ; CI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CI-NEXT: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[PTRTOINT]](s32), [[C]](s32)
- ; CI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load (s32), addrspace 6)
- ; CI-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[LOAD]](s32)
- ; CI-NEXT: $vgpr0_vgpr1 = COPY [[ZEXT]](s64)
+ ; CI-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[PTRTOINT]](s32)
+ ; CI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[ZEXT]](s64)
+ ; CI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[INTTOPTR]](p4) :: (load (s32), addrspace 6)
+ ; CI-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[LOAD]](s32)
+ ; CI-NEXT: $vgpr0_vgpr1 = COPY [[ZEXT1]](s64)
%0:_(p6) = COPY $sgpr0
%1:_(s64) = G_ZEXTLOAD %0 :: (load (s32), align 4, addrspace 6)
$vgpr0_vgpr1 = COPY %1
@@ -35,11 +35,11 @@ body: |
; CI-NEXT: {{ $}}
; CI-NEXT: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0
; CI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p6)
- ; CI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CI-NEXT: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[PTRTOINT]](s32), [[C]](s32)
- ; CI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load (s32), align 2, addrspace 6)
- ; CI-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[LOAD]](s32)
- ; CI-NEXT: $vgpr0_vgpr1 = COPY [[ZEXT]](s64)
+ ; CI-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[PTRTOINT]](s32)
+ ; CI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[ZEXT]](s64)
+ ; CI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[INTTOPTR]](p4) :: (load (s32), align 2, addrspace 6)
+ ; CI-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[LOAD]](s32)
+ ; CI-NEXT: $vgpr0_vgpr1 = COPY [[ZEXT1]](s64)
%0:_(p6) = COPY $sgpr0
%1:_(s64) = G_ZEXTLOAD %0 :: (load (s32), align 2, addrspace 6)
$vgpr0_vgpr1 = COPY %1
@@ -56,11 +56,11 @@ body: |
; CI-NEXT: {{ $}}
; CI-NEXT: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0
; CI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p6)
- ; CI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CI-NEXT: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[PTRTOINT]](s32), [[C]](s32)
- ; CI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load (s32), align 1, addrspace 6)
- ; CI-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[LOAD]](s32)
- ; CI-NEXT: $vgpr0_vgpr1 = COPY [[ZEXT]](s64)
+ ; CI-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[PTRTOINT]](s32)
+ ; CI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[ZEXT]](s64)
+ ; CI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[INTTOPTR]](p4) :: (load (s32), align 1, addrspace 6)
+ ; CI-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[LOAD]](s32)
+ ; CI-NEXT: $vgpr0_vgpr1 = COPY [[ZEXT1]](s64)
%0:_(p6) = COPY $sgpr0
%1:_(s64) = G_ZEXTLOAD %0 :: (load (s32), align 1, addrspace 6)
$vgpr0_vgpr1 = COPY %1
@@ -77,9 +77,9 @@ body: |
; CI-NEXT: {{ $}}
; CI-NEXT: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0
; CI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p6)
- ; CI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CI-NEXT: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[PTRTOINT]](s32), [[C]](s32)
- ; CI-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[MV]](p4) :: (load (s8), addrspace 6)
+ ; CI-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[PTRTOINT]](s32)
+ ; CI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[ZEXT]](s64)
+ ; CI-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[INTTOPTR]](p4) :: (load (s8), addrspace 6)
; CI-NEXT: $vgpr0 = COPY [[ZEXTLOAD]](s32)
%0:_(p6) = COPY $sgpr0
%1:_(s32) = G_ZEXTLOAD %0 :: (load (s8), align 1, addrspace 6)
@@ -97,9 +97,9 @@ body: |
; CI-NEXT: {{ $}}
; CI-NEXT: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0
; CI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p6)
- ; CI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CI-NEXT: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[PTRTOINT]](s32), [[C]](s32)
- ; CI-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[MV]](p4) :: (load (s16), addrspace 6)
+ ; CI-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[PTRTOINT]](s32)
+ ; CI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[ZEXT]](s64)
+ ; CI-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[INTTOPTR]](p4) :: (load (s16), addrspace 6)
; CI-NEXT: $vgpr0 = COPY [[ZEXTLOAD]](s32)
%0:_(p6) = COPY $sgpr0
%1:_(s32) = G_ZEXTLOAD %0 :: (load (s16), align 2, addrspace 6)
@@ -117,9 +117,9 @@ body: |
; CI-NEXT: {{ $}}
; CI-NEXT: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0
; CI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p6)
- ; CI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CI-NEXT: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[PTRTOINT]](s32), [[C]](s32)
- ; CI-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[MV]](p4) :: (load (s16), align 1, addrspace 6)
+ ; CI-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[PTRTOINT]](s32)
+ ; CI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[ZEXT]](s64)
+ ; CI-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[INTTOPTR]](p4) :: (load (s16), align 1, addrspace 6)
; CI-NEXT: $vgpr0 = COPY [[ZEXTLOAD]](s32)
%0:_(p6) = COPY $sgpr0
%1:_(s32) = G_ZEXTLOAD %0 :: (load (s16), align 1, addrspace 6)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
index 02d0e52..d16dc348 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
@@ -34,6 +34,7 @@ define amdgpu_cs i16 @abs_sgpr_i16(i16 inreg %arg) {
;
; GFX1250-LABEL: abs_sgpr_i16:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_sext_i32_i16 s0, s0
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_abs_i32 s0, s0
@@ -43,10 +44,26 @@ define amdgpu_cs i16 @abs_sgpr_i16(i16 inreg %arg) {
}
define amdgpu_cs i32 @abs_sgpr_i32(i32 inreg %arg) {
-; GFX-LABEL: abs_sgpr_i32:
-; GFX: ; %bb.0:
-; GFX-NEXT: s_abs_i32 s0, s0
-; GFX-NEXT: ; return to shader part epilog
+; GFX6-LABEL: abs_sgpr_i32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_abs_i32 s0, s0
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: abs_sgpr_i32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_abs_i32 s0, s0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: abs_sgpr_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_abs_i32 s0, s0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_sgpr_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_abs_i32 s0, s0
+; GFX1250-NEXT: ; return to shader part epilog
%res = call i32 @llvm.abs.i32(i32 %arg, i1 false)
ret i32 %res
}
@@ -81,6 +98,7 @@ define amdgpu_cs i64 @abs_sgpr_i64(i64 inreg %arg) {
;
; GFX1250-LABEL: abs_sgpr_i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_ashr_i32 s2, s1, 31
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_mov_b32 s3, s2
@@ -93,120 +111,146 @@ define amdgpu_cs i64 @abs_sgpr_i64(i64 inreg %arg) {
}
define amdgpu_cs <4 x i32> @abs_sgpr_v4i32(<4 x i32> inreg %arg) {
-; GFX-LABEL: abs_sgpr_v4i32:
-; GFX: ; %bb.0:
-; GFX-NEXT: s_abs_i32 s0, s0
-; GFX-NEXT: s_abs_i32 s1, s1
-; GFX-NEXT: s_abs_i32 s2, s2
-; GFX-NEXT: s_abs_i32 s3, s3
-; GFX-NEXT: ; return to shader part epilog
+; GFX6-LABEL: abs_sgpr_v4i32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_abs_i32 s0, s0
+; GFX6-NEXT: s_abs_i32 s1, s1
+; GFX6-NEXT: s_abs_i32 s2, s2
+; GFX6-NEXT: s_abs_i32 s3, s3
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: abs_sgpr_v4i32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_abs_i32 s0, s0
+; GFX8-NEXT: s_abs_i32 s1, s1
+; GFX8-NEXT: s_abs_i32 s2, s2
+; GFX8-NEXT: s_abs_i32 s3, s3
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: abs_sgpr_v4i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_abs_i32 s0, s0
+; GFX10-NEXT: s_abs_i32 s1, s1
+; GFX10-NEXT: s_abs_i32 s2, s2
+; GFX10-NEXT: s_abs_i32 s3, s3
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_sgpr_v4i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_abs_i32 s0, s0
+; GFX1250-NEXT: s_abs_i32 s1, s1
+; GFX1250-NEXT: s_abs_i32 s2, s2
+; GFX1250-NEXT: s_abs_i32 s3, s3
+; GFX1250-NEXT: ; return to shader part epilog
%res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %arg, i1 false)
ret <4 x i32> %res
}
-define amdgpu_cs i16 @abs_vgpr_i16(i16 %arg) {
+define i16 @abs_vgpr_i16(i16 %arg) {
; GFX6-LABEL: abs_vgpr_i16:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0
; GFX6-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_i16:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u16_e32 v1, 0, v0
; GFX8-NEXT: v_max_i16_e32 v0, v0, v1
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_i16:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_sub_nc_u16 v1, 0, v0
; GFX10-NEXT: v_max_i16 v0, v0, v1
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_i16:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_sub_nc_u16 v1, 0, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_max_i16 v0, v0, v1
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
ret i16 %res
}
-define amdgpu_cs i32 @abs_vgpr_i32(i32 %arg) {
+define i32 @abs_vgpr_i32(i32 %arg) {
; GFX6-LABEL: abs_vgpr_i32:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0
; GFX6-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_i32:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 0, v0
; GFX8-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_i32:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0
; GFX10-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 0, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call i32 @llvm.abs.i32(i32 %arg, i1 false)
ret i32 %res
}
-define amdgpu_cs i64 @abs_vgpr_i64(i64 %arg) {
+define i64 @abs_vgpr_i64(i64 %arg) {
; GFX6-LABEL: abs_vgpr_i64:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX6-NEXT: v_xor_b32_e32 v1, v1, v2
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_i64:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
; GFX8-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX8-NEXT: v_xor_b32_e32 v1, v1, v2
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_i64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
; GFX10-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX10-NEXT: v_xor_b32_e32 v1, v1, v2
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_mov_b32_e32 v3, v2
@@ -214,17 +258,15 @@ define amdgpu_cs i64 @abs_vgpr_i64(i64 %arg) {
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX1250-NEXT: v_xor_b32_e32 v1, v1, v2
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call i64 @llvm.abs.i64(i64 %arg, i1 false)
ret i64 %res
}
-define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
+define <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
; GFX6-LABEL: abs_vgpr_v4i32:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v0
; GFX6-NEXT: v_max_i32_e32 v0, v0, v4
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
@@ -233,14 +275,11 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
; GFX6-NEXT: v_max_i32_e32 v2, v2, v4
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
; GFX6-NEXT: v_max_i32_e32 v3, v3, v4
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: v_readfirstlane_b32 s2, v2
-; GFX6-NEXT: v_readfirstlane_b32 s3, v3
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_v4i32:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v0
; GFX8-NEXT: v_max_i32_e32 v0, v0, v4
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v1
@@ -249,14 +288,11 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
; GFX8-NEXT: v_max_i32_e32 v2, v2, v4
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v3
; GFX8-NEXT: v_max_i32_e32 v3, v3, v4
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: v_readfirstlane_b32 s2, v2
-; GFX8-NEXT: v_readfirstlane_b32 s3, v3
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_v4i32:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_e32 v4, 0, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v5, 0, v1
; GFX10-NEXT: v_sub_nc_u32_e32 v6, 0, v2
@@ -265,14 +301,12 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
; GFX10-NEXT: v_max_i32_e32 v1, v1, v5
; GFX10-NEXT: v_max_i32_e32 v2, v2, v6
; GFX10-NEXT: v_max_i32_e32 v3, v3, v7
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: v_readfirstlane_b32 s2, v2
-; GFX10-NEXT: v_readfirstlane_b32 s3, v3
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_v4i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_sub_nc_u32 v4, 0, v0 :: v_dual_sub_nc_u32 v5, 0, v1
; GFX1250-NEXT: v_dual_sub_nc_u32 v6, 0, v2 :: v_dual_sub_nc_u32 v7, 0, v3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -281,67 +315,85 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_max_i32_e32 v2, v2, v6
; GFX1250-NEXT: v_max_i32_e32 v3, v3, v7
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_readfirstlane_b32 s2, v2
-; GFX1250-NEXT: v_readfirstlane_b32 s3, v3
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %arg, i1 false)
ret <4 x i32> %res
}
define amdgpu_cs <2 x i8> @abs_sgpr_v2i8(<2 x i8> inreg %arg) {
-; GFX-LABEL: abs_sgpr_v2i8:
-; GFX: ; %bb.0:
-; GFX-NEXT: s_sext_i32_i8 s0, s0
-; GFX-NEXT: s_sext_i32_i8 s1, s1
-; GFX-NEXT: s_abs_i32 s0, s0
-; GFX-NEXT: s_abs_i32 s1, s1
-; GFX-NEXT: ; return to shader part epilog
+; GFX6-LABEL: abs_sgpr_v2i8:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_sext_i32_i8 s0, s0
+; GFX6-NEXT: s_sext_i32_i8 s1, s1
+; GFX6-NEXT: s_abs_i32 s0, s0
+; GFX6-NEXT: s_abs_i32 s1, s1
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: abs_sgpr_v2i8:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_sext_i32_i8 s0, s0
+; GFX8-NEXT: s_sext_i32_i8 s1, s1
+; GFX8-NEXT: s_abs_i32 s0, s0
+; GFX8-NEXT: s_abs_i32 s1, s1
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: abs_sgpr_v2i8:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_sext_i32_i8 s0, s0
+; GFX10-NEXT: s_sext_i32_i8 s1, s1
+; GFX10-NEXT: s_abs_i32 s0, s0
+; GFX10-NEXT: s_abs_i32 s1, s1
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_sgpr_v2i8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_sext_i32_i8 s0, s0
+; GFX1250-NEXT: s_sext_i32_i8 s1, s1
+; GFX1250-NEXT: s_abs_i32 s0, s0
+; GFX1250-NEXT: s_abs_i32 s1, s1
+; GFX1250-NEXT: ; return to shader part epilog
%res = call <2 x i8> @llvm.abs.v2i8(<2 x i8> %arg, i1 false)
ret <2 x i8> %res
}
-define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
+define <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
; GFX6-LABEL: abs_vgpr_v2i8:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX6-NEXT: v_max_i32_e32 v0, v0, v2
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
; GFX6-NEXT: v_max_i32_e32 v1, v1, v2
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_v2i8:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: v_sub_u16_sdwa v3, v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_sub_u16_sdwa v2, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_max_i16_sdwa v0, sext(v0), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_max_i16_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_v2i8:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX10-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX10-NEXT: v_sub_nc_u16 v2, 0, v0
; GFX10-NEXT: v_sub_nc_u16 v3, 0, v1
; GFX10-NEXT: v_max_i16 v0, v0, v2
; GFX10-NEXT: v_max_i16 v1, v1, v3
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_v2i8:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -350,31 +402,60 @@ define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_max_i16 v0, v0, v2
; GFX1250-NEXT: v_max_i16 v1, v1, v3
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call <2 x i8> @llvm.abs.v2i8(<2 x i8> %arg, i1 false)
ret <2 x i8> %res
}
define amdgpu_cs <3 x i8> @abs_sgpr_v3i8(<3 x i8> inreg %arg) {
-; GFX-LABEL: abs_sgpr_v3i8:
-; GFX: ; %bb.0:
-; GFX-NEXT: s_sext_i32_i8 s0, s0
-; GFX-NEXT: s_sext_i32_i8 s1, s1
-; GFX-NEXT: s_sext_i32_i8 s2, s2
-; GFX-NEXT: s_abs_i32 s0, s0
-; GFX-NEXT: s_abs_i32 s1, s1
-; GFX-NEXT: s_abs_i32 s2, s2
-; GFX-NEXT: ; return to shader part epilog
+; GFX6-LABEL: abs_sgpr_v3i8:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_sext_i32_i8 s0, s0
+; GFX6-NEXT: s_sext_i32_i8 s1, s1
+; GFX6-NEXT: s_sext_i32_i8 s2, s2
+; GFX6-NEXT: s_abs_i32 s0, s0
+; GFX6-NEXT: s_abs_i32 s1, s1
+; GFX6-NEXT: s_abs_i32 s2, s2
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: abs_sgpr_v3i8:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_sext_i32_i8 s0, s0
+; GFX8-NEXT: s_sext_i32_i8 s1, s1
+; GFX8-NEXT: s_sext_i32_i8 s2, s2
+; GFX8-NEXT: s_abs_i32 s0, s0
+; GFX8-NEXT: s_abs_i32 s1, s1
+; GFX8-NEXT: s_abs_i32 s2, s2
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: abs_sgpr_v3i8:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_sext_i32_i8 s0, s0
+; GFX10-NEXT: s_sext_i32_i8 s1, s1
+; GFX10-NEXT: s_sext_i32_i8 s2, s2
+; GFX10-NEXT: s_abs_i32 s0, s0
+; GFX10-NEXT: s_abs_i32 s1, s1
+; GFX10-NEXT: s_abs_i32 s2, s2
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_sgpr_v3i8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_sext_i32_i8 s0, s0
+; GFX1250-NEXT: s_sext_i32_i8 s1, s1
+; GFX1250-NEXT: s_sext_i32_i8 s2, s2
+; GFX1250-NEXT: s_abs_i32 s0, s0
+; GFX1250-NEXT: s_abs_i32 s1, s1
+; GFX1250-NEXT: s_abs_i32 s2, s2
+; GFX1250-NEXT: ; return to shader part epilog
%res = call <3 x i8> @llvm.abs.v3i8(<3 x i8> %arg, i1 false)
ret <3 x i8> %res
}
-define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
+define <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
; GFX6-LABEL: abs_vgpr_v3i8:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8
@@ -384,13 +465,11 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
; GFX6-NEXT: v_max_i32_e32 v2, v2, v3
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: v_readfirstlane_b32 s2, v2
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_v3i8:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_sub_u16_sdwa v4, v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_max_i16_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -398,13 +477,11 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
; GFX8-NEXT: v_sub_u16_sdwa v3, v3, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_max_i16_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_max_i16_sdwa v2, sext(v2), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: v_readfirstlane_b32 s2, v2
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_v3i8:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX10-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX10-NEXT: v_bfe_i32 v2, v2, 0, 8
@@ -414,13 +491,12 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
; GFX10-NEXT: v_max_i16 v0, v0, v3
; GFX10-NEXT: v_max_i16 v1, v1, v4
; GFX10-NEXT: v_max_i16 v2, v2, v5
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: v_readfirstlane_b32 s2, v2
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_v3i8:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX1250-NEXT: v_bfe_i32 v2, v2, 0, 8
@@ -433,12 +509,7 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_max_i16 v1, v1, v4
; GFX1250-NEXT: v_max_i16 v2, v2, v5
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX1250-NEXT: v_readfirstlane_b32 s2, v2
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call <3 x i8> @llvm.abs.v3i8(<3 x i8> %arg, i1 false)
ret <3 x i8> %res
}
@@ -474,6 +545,7 @@ define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) {
;
; GFX1250-LABEL: abs_sgpr_v2i16:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_sext_i32_i16 s1, s0
; GFX1250-NEXT: s_ashr_i32 s0, s0, 16
; GFX1250-NEXT: s_abs_i32 s1, s1
@@ -485,44 +557,44 @@ define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) {
ret <2 x i16> %res
}
-define amdgpu_cs <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) {
+define <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) {
; GFX6-LABEL: abs_vgpr_v2i16:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_max_i32_e32 v0, v0, v2
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
; GFX6-NEXT: v_max_i32_e32 v1, v1, v2
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_v2i16:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: v_sub_u16_e32 v1, 0, v0
; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_max_i16_e32 v1, v0, v1
; GFX8-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_v2i16:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v1, 0, v0
; GFX10-NEXT: v_pk_max_i16 v0, v0, v1
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_v2i16:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_pk_sub_i16 v1, 0, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_pk_max_i16 v0, v0, v1
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %arg, i1 false)
ret <2 x i16> %res
}
@@ -564,6 +636,7 @@ define amdgpu_cs <3 x i16> @abs_sgpr_v3i16(<3 x i16> inreg %arg) {
;
; GFX1250-LABEL: abs_sgpr_v3i16:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_sext_i32_i16 s2, s0
; GFX1250-NEXT: s_ashr_i32 s0, s0, 16
; GFX1250-NEXT: s_abs_i32 s2, s2
@@ -576,9 +649,10 @@ define amdgpu_cs <3 x i16> @abs_sgpr_v3i16(<3 x i16> inreg %arg) {
ret <3 x i16> %res
}
-define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
+define <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
; GFX6-LABEL: abs_vgpr_v3i16:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
@@ -588,13 +662,11 @@ define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
; GFX6-NEXT: v_max_i32_e32 v2, v2, v3
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: v_readfirstlane_b32 s2, v2
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_v3i16:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_sub_u16_e32 v2, 0, v0
; GFX8-NEXT: v_sub_u16_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -603,31 +675,29 @@ define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
; GFX8-NEXT: v_max_i16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
; GFX8-NEXT: v_max_i16_e32 v1, v1, v4
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_v3i16:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v2, 0, v0
; GFX10-NEXT: v_sub_nc_u16 v3, 0, v1
; GFX10-NEXT: v_pk_max_i16 v0, v0, v2
; GFX10-NEXT: v_max_i16 v1, v1, v3
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_v3i16:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_pk_sub_i16 v2, 0, v0
; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_pk_max_i16 v0, v0, v2
; GFX1250-NEXT: v_max_i16 v1, v1, v3
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %arg, i1 false)
ret <3 x i16> %res
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX: {{.*}}
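Two mechanical changes drive the churn in the vgpr tests above: the functions were moved from the amdgpu_cs calling convention to ordinary functions, so results stay in VGPRs (the v_readfirstlane copies into SGPRs and the shader-part epilogs disappear) and each function instead waits on the incoming counters and returns via s_setpc_b64 / s_set_pc_i64; separately, GFX1250 shader entry points now set a wave-mode bit up front with s_setreg_imm32_b32. The llvm.abs expansion itself is unchanged: with no native VALU abs, it is negate-then-signed-max. A minimal IR sketch of that expansion (illustrative, not code from the patch):

define i32 @abs_sketch(i32 %x) {
  %neg = sub i32 0, %x                    ; matches v_sub_*  v1, 0, v0
  %gt  = icmp sgt i32 %x, %neg
  %abs = select i1 %gt, i32 %x, i32 %neg  ; signed max: matches v_max_i32 v0, v0, v1
  ret i32 %abs
}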
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
index 390f62d..714e8e4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
@@ -16,7 +16,7 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
; GFX10-NEXT: .LBB0_2: ; %bb
; GFX10-NEXT: s_load_dword s0, s[8:9], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: global_store_dword v[0:1], v0, off
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
deleted file mode 100644
index 70bfb2e..0000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
+++ /dev/null
@@ -1,213 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s -check-prefix=GFX10
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s -check-prefix=GFX10
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=GFX11
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12
-
-define i32 @global_atomic_csub(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %data)
- ret i32 %ret
-}
-
-define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_offset:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub_offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub_offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
- ret i32 %ret
-}
-
-define void @global_atomic_csub_nortn(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_nortn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub_nortn:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub_nortn:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %data)
- ret void
-}
-
-define void @global_atomic_csub_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_offset_nortn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub_offset_nortn:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub_offset_nortn:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_sgpr_base_offset:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX10-NEXT: v_mov_b32_e32 v1, 0x1000
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: global_atomic_csub v0, v1, v0, s[0:1] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v[0:1], v0, off
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: global_atomic_csub_sgpr_base_offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_store_b32 v[0:1], v0, off
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-LABEL: global_atomic_csub_sgpr_base_offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b32 v[0:1], v0, off
-; GFX12-NEXT: s_endpgm
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
- store i32 %ret, ptr addrspace(1) poison
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_sgpr_base_offset_nortn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX10-NEXT: v_mov_b32_e32 v1, 0x1000
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: global_atomic_csub v0, v1, v0, s[0:1] glc
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: global_atomic_csub_sgpr_base_offset_nortn:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-LABEL: global_atomic_csub_sgpr_base_offset_nortn:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_endpgm
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
- ret void
-}
-
-declare i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) nocapture, i32) #1
-
-attributes #0 = { nounwind willreturn }
-attributes #1 = { argmemonly nounwind }
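
The deleted file exercised the target-specific intrinsic for the clamped-subtract atomic (note the GFX12 spelling global_atomic_sub_clamp_u32 in the old checks). The same operation is representable with the generic atomicrmw usub_sat form available in recent LLVM, which is presumably where the coverage migrated; a minimal LLVM IR sketch under that assumption (function name illustrative):

define i32 @global_atomic_usub_sat(ptr addrspace(1) %ptr, i32 %data) {
  ; Atomically stores max(*ptr - %data, 0) and yields the value that was in
  ; memory beforehand, like the glc / TH_ATOMIC_RETURN forms checked above.
  %old = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data monotonic
  ret i32 %old
}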
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index b0ca1e8..c23afeb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -144,43 +144,41 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float
define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) {
; GFX1030-LABEL: image_bvh_intersect_ray_vgpr_descr:
; GFX1030: ; %bb.0:
-; GFX1030-NEXT: v_mov_b32_e32 v21, v0
-; GFX1030-NEXT: v_mov_b32_e32 v22, v1
-; GFX1030-NEXT: v_mov_b32_e32 v23, v2
-; GFX1030-NEXT: v_mov_b32_e32 v24, v3
-; GFX1030-NEXT: v_mov_b32_e32 v25, v4
-; GFX1030-NEXT: v_mov_b32_e32 v26, v5
-; GFX1030-NEXT: v_mov_b32_e32 v27, v6
-; GFX1030-NEXT: v_mov_b32_e32 v28, v7
-; GFX1030-NEXT: v_mov_b32_e32 v29, v8
-; GFX1030-NEXT: v_mov_b32_e32 v30, v9
-; GFX1030-NEXT: v_mov_b32_e32 v31, v10
-; GFX1030-NEXT: v_mov_b32_e32 v19, v11
-; GFX1030-NEXT: v_mov_b32_e32 v20, v12
+; GFX1030-NEXT: v_mov_b32_e32 v15, v0
+; GFX1030-NEXT: v_mov_b32_e32 v16, v1
+; GFX1030-NEXT: v_mov_b32_e32 v17, v2
+; GFX1030-NEXT: v_mov_b32_e32 v18, v3
+; GFX1030-NEXT: v_mov_b32_e32 v19, v4
+; GFX1030-NEXT: v_mov_b32_e32 v20, v5
+; GFX1030-NEXT: v_mov_b32_e32 v21, v6
+; GFX1030-NEXT: v_mov_b32_e32 v22, v7
+; GFX1030-NEXT: v_mov_b32_e32 v23, v8
+; GFX1030-NEXT: v_mov_b32_e32 v24, v9
+; GFX1030-NEXT: v_mov_b32_e32 v25, v10
; GFX1030-NEXT: s_mov_b32 s1, exec_lo
; GFX1030-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
-; GFX1030-NEXT: v_readfirstlane_b32 s4, v19
-; GFX1030-NEXT: v_readfirstlane_b32 s5, v20
+; GFX1030-NEXT: v_readfirstlane_b32 s4, v11
+; GFX1030-NEXT: v_readfirstlane_b32 s5, v12
; GFX1030-NEXT: v_readfirstlane_b32 s6, v13
; GFX1030-NEXT: v_readfirstlane_b32 s7, v14
-; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[19:20]
+; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1030-NEXT: s_and_saveexec_b32 s0, s0
-; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[21:31], s[4:7]
+; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[15:25], s[4:7]
+; GFX1030-NEXT: ; implicit-def: $vgpr11
+; GFX1030-NEXT: ; implicit-def: $vgpr15
+; GFX1030-NEXT: ; implicit-def: $vgpr16
+; GFX1030-NEXT: ; implicit-def: $vgpr17
+; GFX1030-NEXT: ; implicit-def: $vgpr18
; GFX1030-NEXT: ; implicit-def: $vgpr19
+; GFX1030-NEXT: ; implicit-def: $vgpr20
; GFX1030-NEXT: ; implicit-def: $vgpr21
; GFX1030-NEXT: ; implicit-def: $vgpr22
; GFX1030-NEXT: ; implicit-def: $vgpr23
; GFX1030-NEXT: ; implicit-def: $vgpr24
; GFX1030-NEXT: ; implicit-def: $vgpr25
-; GFX1030-NEXT: ; implicit-def: $vgpr26
-; GFX1030-NEXT: ; implicit-def: $vgpr27
-; GFX1030-NEXT: ; implicit-def: $vgpr28
-; GFX1030-NEXT: ; implicit-def: $vgpr29
-; GFX1030-NEXT: ; implicit-def: $vgpr30
-; GFX1030-NEXT: ; implicit-def: $vgpr31
-; GFX1030-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14
+; GFX1030-NEXT: ; implicit-def: $vgpr13_vgpr14
; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1030-NEXT: s_cbranch_execnz .LBB6_1
; GFX1030-NEXT: ; %bb.2:
@@ -190,23 +188,21 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr,
;
; GFX1013-LABEL: image_bvh_intersect_ray_vgpr_descr:
; GFX1013: ; %bb.0:
-; GFX1013-NEXT: v_mov_b32_e32 v19, v11
-; GFX1013-NEXT: v_mov_b32_e32 v20, v12
; GFX1013-NEXT: s_mov_b32 s1, exec_lo
; GFX1013-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
-; GFX1013-NEXT: v_readfirstlane_b32 s4, v19
-; GFX1013-NEXT: v_readfirstlane_b32 s5, v20
+; GFX1013-NEXT: v_readfirstlane_b32 s4, v11
+; GFX1013-NEXT: v_readfirstlane_b32 s5, v12
; GFX1013-NEXT: v_readfirstlane_b32 s6, v13
; GFX1013-NEXT: v_readfirstlane_b32 s7, v14
-; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[19:20]
+; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1013-NEXT: s_and_saveexec_b32 s0, s0
; GFX1013-NEXT: image_bvh_intersect_ray v[15:18], v[0:10], s[4:7]
-; GFX1013-NEXT: ; implicit-def: $vgpr19
+; GFX1013-NEXT: ; implicit-def: $vgpr11
; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10
-; GFX1013-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14
-; GFX1013-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1013-NEXT: ; implicit-def: $vgpr13_vgpr14
+; GFX1013-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1013-NEXT: s_cbranch_execnz .LBB6_1
; GFX1013-NEXT: ; %bb.2:
@@ -220,31 +216,29 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr,
;
; GFX11-LABEL: image_bvh_intersect_ray_vgpr_descr:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_dual_mov_b32 v20, v0 :: v_dual_mov_b32 v21, v1
+; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v19, v1
; GFX11-NEXT: v_dual_mov_b32 v15, v2 :: v_dual_mov_b32 v16, v3
-; GFX11-NEXT: v_dual_mov_b32 v17, v4 :: v_dual_mov_b32 v18, v11
-; GFX11-NEXT: v_mov_b32_e32 v19, v12
+; GFX11-NEXT: v_mov_b32_e32 v17, v4
; GFX11-NEXT: s_mov_b32 s1, exec_lo
; GFX11-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_readfirstlane_b32 s4, v18
-; GFX11-NEXT: v_readfirstlane_b32 s5, v19
+; GFX11-NEXT: v_readfirstlane_b32 s4, v11
+; GFX11-NEXT: v_readfirstlane_b32 s5, v12
; GFX11-NEXT: v_readfirstlane_b32 s6, v13
; GFX11-NEXT: v_readfirstlane_b32 s7, v14
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[18:19]
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v20, v21, v[15:17], v[5:7], v[8:10]], s[4:7]
+; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v18, v19, v[15:17], v[5:7], v[8:10]], s[4:7]
+; GFX11-NEXT: ; implicit-def: $vgpr11
; GFX11-NEXT: ; implicit-def: $vgpr18
-; GFX11-NEXT: ; implicit-def: $vgpr20
-; GFX11-NEXT: ; implicit-def: $vgpr21
+; GFX11-NEXT: ; implicit-def: $vgpr19
; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17
; GFX11-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7
; GFX11-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10
-; GFX11-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14
+; GFX11-NEXT: ; implicit-def: $vgpr13_vgpr14
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB6_1
; GFX11-NEXT: ; %bb.2:
@@ -259,42 +253,40 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr,
define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) {
; GFX1030-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
; GFX1030: ; %bb.0:
-; GFX1030-NEXT: v_mov_b32_e32 v18, v0
-; GFX1030-NEXT: v_mov_b32_e32 v19, v1
+; GFX1030-NEXT: v_mov_b32_e32 v13, v0
+; GFX1030-NEXT: v_mov_b32_e32 v14, v1
; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v5
; GFX1030-NEXT: v_and_b32_e32 v1, 0xffff, v7
-; GFX1030-NEXT: v_mov_b32_e32 v20, v2
+; GFX1030-NEXT: v_mov_b32_e32 v15, v2
; GFX1030-NEXT: v_and_b32_e32 v2, 0xffff, v8
-; GFX1030-NEXT: v_mov_b32_e32 v21, v3
+; GFX1030-NEXT: v_mov_b32_e32 v16, v3
; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT: v_mov_b32_e32 v22, v4
-; GFX1030-NEXT: v_mov_b32_e32 v16, v9
-; GFX1030-NEXT: v_mov_b32_e32 v17, v10
-; GFX1030-NEXT: v_and_or_b32 v23, 0xffff, v5, v0
-; GFX1030-NEXT: v_and_or_b32 v24, 0xffff, v6, v1
-; GFX1030-NEXT: v_alignbit_b32 v25, v2, v7, 16
+; GFX1030-NEXT: v_mov_b32_e32 v17, v4
+; GFX1030-NEXT: v_alignbit_b32 v20, v2, v7, 16
; GFX1030-NEXT: s_mov_b32 s1, exec_lo
+; GFX1030-NEXT: v_and_or_b32 v18, 0xffff, v5, v0
+; GFX1030-NEXT: v_and_or_b32 v19, 0xffff, v6, v1
; GFX1030-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
-; GFX1030-NEXT: v_readfirstlane_b32 s4, v16
-; GFX1030-NEXT: v_readfirstlane_b32 s5, v17
+; GFX1030-NEXT: v_readfirstlane_b32 s4, v9
+; GFX1030-NEXT: v_readfirstlane_b32 s5, v10
; GFX1030-NEXT: v_readfirstlane_b32 s6, v11
; GFX1030-NEXT: v_readfirstlane_b32 s7, v12
-; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[16:17]
+; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12]
; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1030-NEXT: s_and_saveexec_b32 s0, s0
-; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[18:25], s[4:7] a16
+; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[13:20], s[4:7] a16
+; GFX1030-NEXT: ; implicit-def: $vgpr9
+; GFX1030-NEXT: ; implicit-def: $vgpr13
+; GFX1030-NEXT: ; implicit-def: $vgpr14
+; GFX1030-NEXT: ; implicit-def: $vgpr15
; GFX1030-NEXT: ; implicit-def: $vgpr16
+; GFX1030-NEXT: ; implicit-def: $vgpr17
; GFX1030-NEXT: ; implicit-def: $vgpr18
; GFX1030-NEXT: ; implicit-def: $vgpr19
; GFX1030-NEXT: ; implicit-def: $vgpr20
-; GFX1030-NEXT: ; implicit-def: $vgpr21
-; GFX1030-NEXT: ; implicit-def: $vgpr22
-; GFX1030-NEXT: ; implicit-def: $vgpr23
-; GFX1030-NEXT: ; implicit-def: $vgpr24
-; GFX1030-NEXT: ; implicit-def: $vgpr25
-; GFX1030-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12
+; GFX1030-NEXT: ; implicit-def: $vgpr11_vgpr12
; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1030-NEXT: s_cbranch_execnz .LBB7_1
; GFX1030-NEXT: ; %bb.2:
@@ -304,31 +296,29 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
;
; GFX1013-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
; GFX1013: ; %bb.0:
-; GFX1013-NEXT: v_mov_b32_e32 v17, v9
-; GFX1013-NEXT: v_mov_b32_e32 v18, v10
-; GFX1013-NEXT: v_lshrrev_b32_e32 v9, 16, v5
-; GFX1013-NEXT: v_and_b32_e32 v10, 0xffff, v7
+; GFX1013-NEXT: v_lshrrev_b32_e32 v13, 16, v5
+; GFX1013-NEXT: v_and_b32_e32 v14, 0xffff, v7
; GFX1013-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GFX1013-NEXT: s_mov_b32 s1, exec_lo
-; GFX1013-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX1013-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX1013-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX1013-NEXT: v_alignbit_b32 v7, v8, v7, 16
-; GFX1013-NEXT: v_and_or_b32 v5, 0xffff, v5, v9
-; GFX1013-NEXT: v_and_or_b32 v6, 0xffff, v6, v10
+; GFX1013-NEXT: v_and_or_b32 v5, 0xffff, v5, v13
+; GFX1013-NEXT: v_and_or_b32 v6, 0xffff, v6, v14
; GFX1013-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
-; GFX1013-NEXT: v_readfirstlane_b32 s4, v17
-; GFX1013-NEXT: v_readfirstlane_b32 s5, v18
+; GFX1013-NEXT: v_readfirstlane_b32 s4, v9
+; GFX1013-NEXT: v_readfirstlane_b32 s5, v10
; GFX1013-NEXT: v_readfirstlane_b32 s6, v11
; GFX1013-NEXT: v_readfirstlane_b32 s7, v12
-; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[17:18]
+; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12]
; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1013-NEXT: s_and_saveexec_b32 s0, s0
; GFX1013-NEXT: image_bvh_intersect_ray v[13:16], v[0:7], s[4:7] a16
-; GFX1013-NEXT: ; implicit-def: $vgpr17
+; GFX1013-NEXT: ; implicit-def: $vgpr9
; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-; GFX1013-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12
-; GFX1013-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1013-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1013-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1013-NEXT: s_cbranch_execnz .LBB7_1
; GFX1013-NEXT: ; %bb.2:
@@ -343,33 +333,32 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
; GFX11-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
-; GFX11-NEXT: v_dual_mov_b32 v19, v10 :: v_dual_and_b32 v0, 0xffff, v7
+; GFX11-NEXT: v_dual_mov_b32 v15, v4 :: v_dual_and_b32 v0, 0xffff, v7
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v8
; GFX11-NEXT: v_dual_mov_b32 v13, v2 :: v_dual_mov_b32 v14, v3
-; GFX11-NEXT: v_dual_mov_b32 v15, v4 :: v_dual_mov_b32 v18, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_lshl_or_b32 v4, v5, 16, v0
; GFX11-NEXT: v_perm_b32 v5, v5, v7, 0x7060302
; GFX11-NEXT: v_lshl_or_b32 v6, v6, 16, v1
-; GFX11-NEXT: s_mov_b32 s1, exec_lo
; GFX11-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_readfirstlane_b32 s4, v18
-; GFX11-NEXT: v_readfirstlane_b32 s5, v19
+; GFX11-NEXT: v_readfirstlane_b32 s4, v9
+; GFX11-NEXT: v_readfirstlane_b32 s5, v10
; GFX11-NEXT: v_readfirstlane_b32 s6, v11
; GFX11-NEXT: v_readfirstlane_b32 s7, v12
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[18:19]
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12]
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v16, v17, v[13:15], v[4:6]], s[4:7] a16
-; GFX11-NEXT: ; implicit-def: $vgpr18
+; GFX11-NEXT: ; implicit-def: $vgpr9
; GFX11-NEXT: ; implicit-def: $vgpr16
; GFX11-NEXT: ; implicit-def: $vgpr17
; GFX11-NEXT: ; implicit-def: $vgpr13_vgpr14_vgpr15
; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6
-; GFX11-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12
+; GFX11-NEXT: ; implicit-def: $vgpr11_vgpr12
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB7_1
; GFX11-NEXT: ; %bb.2:
@@ -384,45 +373,43 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) {
; GFX1030-LABEL: image_bvh64_intersect_ray_vgpr_descr:
; GFX1030: ; %bb.0:
-; GFX1030-NEXT: v_mov_b32_e32 v22, v0
-; GFX1030-NEXT: v_mov_b32_e32 v23, v1
-; GFX1030-NEXT: v_mov_b32_e32 v24, v2
-; GFX1030-NEXT: v_mov_b32_e32 v25, v3
-; GFX1030-NEXT: v_mov_b32_e32 v26, v4
-; GFX1030-NEXT: v_mov_b32_e32 v27, v5
-; GFX1030-NEXT: v_mov_b32_e32 v28, v6
-; GFX1030-NEXT: v_mov_b32_e32 v29, v7
-; GFX1030-NEXT: v_mov_b32_e32 v30, v8
-; GFX1030-NEXT: v_mov_b32_e32 v31, v9
-; GFX1030-NEXT: v_mov_b32_e32 v32, v10
-; GFX1030-NEXT: v_mov_b32_e32 v33, v11
-; GFX1030-NEXT: v_mov_b32_e32 v20, v12
-; GFX1030-NEXT: v_mov_b32_e32 v21, v13
+; GFX1030-NEXT: v_mov_b32_e32 v16, v0
+; GFX1030-NEXT: v_mov_b32_e32 v17, v1
+; GFX1030-NEXT: v_mov_b32_e32 v18, v2
+; GFX1030-NEXT: v_mov_b32_e32 v19, v3
+; GFX1030-NEXT: v_mov_b32_e32 v20, v4
+; GFX1030-NEXT: v_mov_b32_e32 v21, v5
+; GFX1030-NEXT: v_mov_b32_e32 v22, v6
+; GFX1030-NEXT: v_mov_b32_e32 v23, v7
+; GFX1030-NEXT: v_mov_b32_e32 v24, v8
+; GFX1030-NEXT: v_mov_b32_e32 v25, v9
+; GFX1030-NEXT: v_mov_b32_e32 v26, v10
+; GFX1030-NEXT: v_mov_b32_e32 v27, v11
; GFX1030-NEXT: s_mov_b32 s1, exec_lo
; GFX1030-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
-; GFX1030-NEXT: v_readfirstlane_b32 s4, v20
-; GFX1030-NEXT: v_readfirstlane_b32 s5, v21
+; GFX1030-NEXT: v_readfirstlane_b32 s4, v12
+; GFX1030-NEXT: v_readfirstlane_b32 s5, v13
; GFX1030-NEXT: v_readfirstlane_b32 s6, v14
; GFX1030-NEXT: v_readfirstlane_b32 s7, v15
-; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[20:21]
+; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[12:13]
; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15]
; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1030-NEXT: s_and_saveexec_b32 s0, s0
-; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[22:33], s[4:7]
+; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[16:27], s[4:7]
+; GFX1030-NEXT: ; implicit-def: $vgpr12
+; GFX1030-NEXT: ; implicit-def: $vgpr16
+; GFX1030-NEXT: ; implicit-def: $vgpr17
+; GFX1030-NEXT: ; implicit-def: $vgpr18
+; GFX1030-NEXT: ; implicit-def: $vgpr19
; GFX1030-NEXT: ; implicit-def: $vgpr20
+; GFX1030-NEXT: ; implicit-def: $vgpr21
; GFX1030-NEXT: ; implicit-def: $vgpr22
; GFX1030-NEXT: ; implicit-def: $vgpr23
; GFX1030-NEXT: ; implicit-def: $vgpr24
; GFX1030-NEXT: ; implicit-def: $vgpr25
; GFX1030-NEXT: ; implicit-def: $vgpr26
; GFX1030-NEXT: ; implicit-def: $vgpr27
-; GFX1030-NEXT: ; implicit-def: $vgpr28
-; GFX1030-NEXT: ; implicit-def: $vgpr29
-; GFX1030-NEXT: ; implicit-def: $vgpr30
-; GFX1030-NEXT: ; implicit-def: $vgpr31
-; GFX1030-NEXT: ; implicit-def: $vgpr32
-; GFX1030-NEXT: ; implicit-def: $vgpr33
-; GFX1030-NEXT: ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15
+; GFX1030-NEXT: ; implicit-def: $vgpr14_vgpr15
; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1030-NEXT: s_cbranch_execnz .LBB8_1
; GFX1030-NEXT: ; %bb.2:
@@ -432,23 +419,21 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr
;
; GFX1013-LABEL: image_bvh64_intersect_ray_vgpr_descr:
; GFX1013: ; %bb.0:
-; GFX1013-NEXT: v_mov_b32_e32 v20, v12
-; GFX1013-NEXT: v_mov_b32_e32 v21, v13
; GFX1013-NEXT: s_mov_b32 s1, exec_lo
; GFX1013-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
-; GFX1013-NEXT: v_readfirstlane_b32 s4, v20
-; GFX1013-NEXT: v_readfirstlane_b32 s5, v21
+; GFX1013-NEXT: v_readfirstlane_b32 s4, v12
+; GFX1013-NEXT: v_readfirstlane_b32 s5, v13
; GFX1013-NEXT: v_readfirstlane_b32 s6, v14
; GFX1013-NEXT: v_readfirstlane_b32 s7, v15
-; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[20:21]
+; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[12:13]
; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15]
; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1013-NEXT: s_and_saveexec_b32 s0, s0
; GFX1013-NEXT: image_bvh64_intersect_ray v[16:19], v[0:11], s[4:7]
-; GFX1013-NEXT: ; implicit-def: $vgpr20
+; GFX1013-NEXT: ; implicit-def: $vgpr12
; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
-; GFX1013-NEXT: ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15
-; GFX1013-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1013-NEXT: ; implicit-def: $vgpr14_vgpr15
+; GFX1013-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1013-NEXT: s_cbranch_execnz .LBB8_1
; GFX1013-NEXT: ; %bb.2:
@@ -465,28 +450,26 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr
; GFX11-NEXT: v_dual_mov_b32 v19, v0 :: v_dual_mov_b32 v20, v1
; GFX11-NEXT: v_dual_mov_b32 v21, v2 :: v_dual_mov_b32 v16, v3
; GFX11-NEXT: v_dual_mov_b32 v17, v4 :: v_dual_mov_b32 v18, v5
-; GFX11-NEXT: v_dual_mov_b32 v4, v12 :: v_dual_mov_b32 v5, v13
; GFX11-NEXT: s_mov_b32 s1, exec_lo
; GFX11-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_readfirstlane_b32 s4, v4
-; GFX11-NEXT: v_readfirstlane_b32 s5, v5
+; GFX11-NEXT: v_readfirstlane_b32 s4, v12
+; GFX11-NEXT: v_readfirstlane_b32 s5, v13
; GFX11-NEXT: v_readfirstlane_b32 s6, v14
; GFX11-NEXT: v_readfirstlane_b32 s7, v15
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[12:13]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15]
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[19:20], v21, v[16:18], v[6:8], v[9:11]], s[4:7]
-; GFX11-NEXT: ; implicit-def: $vgpr4
+; GFX11-NEXT: ; implicit-def: $vgpr12
; GFX11-NEXT: ; implicit-def: $vgpr19_vgpr20
; GFX11-NEXT: ; implicit-def: $vgpr21
; GFX11-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18
; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8
; GFX11-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11
-; GFX11-NEXT: ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-NEXT: ; implicit-def: $vgpr14_vgpr15
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB8_1
; GFX11-NEXT: ; %bb.2:
@@ -501,44 +484,42 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) {
; GFX1030-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
; GFX1030: ; %bb.0:
-; GFX1030-NEXT: v_mov_b32_e32 v19, v0
-; GFX1030-NEXT: v_mov_b32_e32 v20, v1
+; GFX1030-NEXT: v_mov_b32_e32 v14, v0
+; GFX1030-NEXT: v_mov_b32_e32 v15, v1
; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v6
; GFX1030-NEXT: v_and_b32_e32 v1, 0xffff, v8
-; GFX1030-NEXT: v_mov_b32_e32 v21, v2
+; GFX1030-NEXT: v_mov_b32_e32 v16, v2
; GFX1030-NEXT: v_and_b32_e32 v2, 0xffff, v9
-; GFX1030-NEXT: v_mov_b32_e32 v22, v3
+; GFX1030-NEXT: v_mov_b32_e32 v17, v3
; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT: v_mov_b32_e32 v23, v4
-; GFX1030-NEXT: v_mov_b32_e32 v24, v5
-; GFX1030-NEXT: v_mov_b32_e32 v17, v10
-; GFX1030-NEXT: v_mov_b32_e32 v18, v11
-; GFX1030-NEXT: v_and_or_b32 v25, 0xffff, v6, v0
-; GFX1030-NEXT: v_and_or_b32 v26, 0xffff, v7, v1
-; GFX1030-NEXT: v_alignbit_b32 v27, v2, v8, 16
+; GFX1030-NEXT: v_mov_b32_e32 v18, v4
+; GFX1030-NEXT: v_mov_b32_e32 v19, v5
+; GFX1030-NEXT: v_alignbit_b32 v22, v2, v8, 16
+; GFX1030-NEXT: v_and_or_b32 v20, 0xffff, v6, v0
+; GFX1030-NEXT: v_and_or_b32 v21, 0xffff, v7, v1
; GFX1030-NEXT: s_mov_b32 s1, exec_lo
; GFX1030-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
-; GFX1030-NEXT: v_readfirstlane_b32 s4, v17
-; GFX1030-NEXT: v_readfirstlane_b32 s5, v18
+; GFX1030-NEXT: v_readfirstlane_b32 s4, v10
+; GFX1030-NEXT: v_readfirstlane_b32 s5, v11
; GFX1030-NEXT: v_readfirstlane_b32 s6, v12
; GFX1030-NEXT: v_readfirstlane_b32 s7, v13
-; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[17:18]
+; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1030-NEXT: s_and_saveexec_b32 s0, s0
-; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[19:27], s[4:7] a16
+; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[14:22], s[4:7] a16
+; GFX1030-NEXT: ; implicit-def: $vgpr10
+; GFX1030-NEXT: ; implicit-def: $vgpr14
+; GFX1030-NEXT: ; implicit-def: $vgpr15
+; GFX1030-NEXT: ; implicit-def: $vgpr16
; GFX1030-NEXT: ; implicit-def: $vgpr17
+; GFX1030-NEXT: ; implicit-def: $vgpr18
; GFX1030-NEXT: ; implicit-def: $vgpr19
; GFX1030-NEXT: ; implicit-def: $vgpr20
; GFX1030-NEXT: ; implicit-def: $vgpr21
; GFX1030-NEXT: ; implicit-def: $vgpr22
-; GFX1030-NEXT: ; implicit-def: $vgpr23
-; GFX1030-NEXT: ; implicit-def: $vgpr24
-; GFX1030-NEXT: ; implicit-def: $vgpr25
-; GFX1030-NEXT: ; implicit-def: $vgpr26
-; GFX1030-NEXT: ; implicit-def: $vgpr27
-; GFX1030-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13
+; GFX1030-NEXT: ; implicit-def: $vgpr12_vgpr13
; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1030-NEXT: s_cbranch_execnz .LBB9_1
; GFX1030-NEXT: ; %bb.2:
@@ -548,31 +529,29 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
;
; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
; GFX1013: ; %bb.0:
-; GFX1013-NEXT: v_mov_b32_e32 v18, v10
-; GFX1013-NEXT: v_mov_b32_e32 v19, v11
-; GFX1013-NEXT: v_lshrrev_b32_e32 v10, 16, v6
-; GFX1013-NEXT: v_and_b32_e32 v11, 0xffff, v8
+; GFX1013-NEXT: v_lshrrev_b32_e32 v14, 16, v6
+; GFX1013-NEXT: v_and_b32_e32 v15, 0xffff, v8
; GFX1013-NEXT: v_and_b32_e32 v9, 0xffff, v9
; GFX1013-NEXT: s_mov_b32 s1, exec_lo
-; GFX1013-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX1013-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX1013-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GFX1013-NEXT: v_alignbit_b32 v8, v9, v8, 16
-; GFX1013-NEXT: v_and_or_b32 v6, 0xffff, v6, v10
-; GFX1013-NEXT: v_and_or_b32 v7, 0xffff, v7, v11
+; GFX1013-NEXT: v_and_or_b32 v6, 0xffff, v6, v14
+; GFX1013-NEXT: v_and_or_b32 v7, 0xffff, v7, v15
; GFX1013-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
-; GFX1013-NEXT: v_readfirstlane_b32 s4, v18
-; GFX1013-NEXT: v_readfirstlane_b32 s5, v19
+; GFX1013-NEXT: v_readfirstlane_b32 s4, v10
+; GFX1013-NEXT: v_readfirstlane_b32 s5, v11
; GFX1013-NEXT: v_readfirstlane_b32 s6, v12
; GFX1013-NEXT: v_readfirstlane_b32 s7, v13
-; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[18:19]
+; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1013-NEXT: s_and_saveexec_b32 s0, s0
; GFX1013-NEXT: image_bvh64_intersect_ray v[14:17], v[0:8], s[4:7] a16
-; GFX1013-NEXT: ; implicit-def: $vgpr18
+; GFX1013-NEXT: ; implicit-def: $vgpr10
; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8
-; GFX1013-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13
-; GFX1013-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1013-NEXT: ; implicit-def: $vgpr12_vgpr13
+; GFX1013-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1013-NEXT: s_cbranch_execnz .LBB9_1
; GFX1013-NEXT: ; %bb.2:
@@ -591,29 +570,29 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v9
; GFX11-NEXT: v_dual_mov_b32 v19, v2 :: v_dual_mov_b32 v14, v3
; GFX11-NEXT: v_dual_mov_b32 v15, v4 :: v_dual_mov_b32 v16, v5
-; GFX11-NEXT: v_dual_mov_b32 v4, v10 :: v_dual_mov_b32 v5, v11
-; GFX11-NEXT: v_lshl_or_b32 v20, v6, 16, v0
-; GFX11-NEXT: v_perm_b32 v21, v6, v8, 0x7060302
-; GFX11-NEXT: v_lshl_or_b32 v22, v7, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_lshl_or_b32 v4, v6, 16, v0
+; GFX11-NEXT: v_perm_b32 v5, v6, v8, 0x7060302
+; GFX11-NEXT: v_lshl_or_b32 v6, v7, 16, v1
; GFX11-NEXT: s_mov_b32 s1, exec_lo
; GFX11-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_readfirstlane_b32 s4, v4
-; GFX11-NEXT: v_readfirstlane_b32 s5, v5
+; GFX11-NEXT: v_readfirstlane_b32 s4, v10
+; GFX11-NEXT: v_readfirstlane_b32 s5, v11
; GFX11-NEXT: v_readfirstlane_b32 s6, v12
; GFX11-NEXT: v_readfirstlane_b32 s7, v13
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[17:18], v19, v[14:16], v[20:22]], s[4:7] a16
-; GFX11-NEXT: ; implicit-def: $vgpr4
+; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[17:18], v19, v[14:16], v[4:6]], s[4:7] a16
+; GFX11-NEXT: ; implicit-def: $vgpr10
; GFX11-NEXT: ; implicit-def: $vgpr17_vgpr18
; GFX11-NEXT: ; implicit-def: $vgpr19
; GFX11-NEXT: ; implicit-def: $vgpr14_vgpr15_vgpr16
-; GFX11-NEXT: ; implicit-def: $vgpr20_vgpr21_vgpr22
-; GFX11-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13
+; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6
+; GFX11-NEXT: ; implicit-def: $vgpr12_vgpr13
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB9_1
; GFX11-NEXT: ; %bb.2:
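
A note on the loops being renumbered above: image_bvh_intersect_ray needs its texture descriptor in SGPRs, but these tests deliberately pass it as a divergent VGPR value, so the compiler wraps the instruction in a waterfall loop — v_readfirstlane_b32 grabs one lane's descriptor, the v_cmp_eq_u64 / s_and_saveexec pair narrows exec to the lanes holding that same descriptor, the ray query runs, and s_xor_b32 / s_cbranch_execnz repeat until every lane has been serviced. The VGPR renumbering falls out of no longer staging part of the descriptor through extra copies before the loop. A sketch of IR that produces this pattern (intrinsic mangling assumed from the usual convention; a uniform descriptor would instead select the instruction directly):

declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>)

define amdgpu_ps <4 x float> @bvh_waterfall(i32 %node, float %extent, <3 x float> %origin, <3 x float> %dir, <3 x float> %inv_dir, <4 x i32> %tdescr) {
  ; %tdescr arrives in VGPRs (non-inreg shader arguments are divergent),
  ; forcing the waterfall loop seen in the checks.
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node, float %extent, <3 x float> %origin, <3 x float> %dir, <3 x float> %inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
}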
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
index 5720b88..cc21305 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN %s
+; RUN: llc -verify-machineinstrs -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN %s
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
index 50377e9..26cdbb1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
@@ -171,7 +171,7 @@ define float @v_rsq_clamp_undef_f32() #0 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_s_rsq_f32 s0, s0
; GFX12-NEXT: v_mov_b32_e32 v0, 0xff7fffff
-; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_minmax_num_f32 v0, s0, 0x7f7fffff, v0
; GFX12-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index e411c23..7b5621f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -27,11 +27,11 @@ define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) {
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 1
-; GCN-NEXT: v_mov_b32_e32 v0, v0
+; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0
%tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
@@ -68,12 +68,12 @@ define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) {
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 1
; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: v_mov_b32_e32 v0, v0
-; GCN-NEXT: v_mov_b32_e32 v1, v1
+; GCN-NEXT: v_mov_b32_e32 v2, v0
+; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0
%tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
index e0016b0..993fb7e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
@@ -12,7 +12,7 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src)
; LOOP-NEXT: s_mov_b32 s3, 0xf000
; LOOP-NEXT: v_mov_b32_e32 v5, s1
; LOOP-NEXT: v_mov_b32_e32 v4, s0
-; LOOP-NEXT: .LBB0_1: ; %load-store-loop
+; LOOP-NEXT: .LBB0_1: ; %static-memcpy-expansion-main-body
; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1
; LOOP-NEXT: v_add_i32_e32 v6, vcc, v2, v4
; LOOP-NEXT: v_addc_u32_e32 v7, vcc, v3, v5, vcc
@@ -177,7 +177,7 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src)
; LOOP-NEXT: buffer_store_byte v30, v[6:7], s[0:3], 0 addr64 offset:30
; LOOP-NEXT: buffer_store_byte v13, v[6:7], s[0:3], 0 addr64 offset:31
; LOOP-NEXT: s_cbranch_vccnz .LBB0_1
-; LOOP-NEXT: ; %bb.2: ; %memcpy-split
+; LOOP-NEXT: ; %bb.2: ; %static-memcpy-post-expansion
; LOOP-NEXT: s_mov_b32 s2, 0
; LOOP-NEXT: s_mov_b32 s3, 0xf000
; LOOP-NEXT: s_mov_b64 s[0:1], 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
index fe002d6..46070dd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
@@ -44,7 +44,7 @@ define i16 @v_powi_f16(i16 %l, i32 %r) {
; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
-; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v0.l
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.h
; GFX11-TRUE16-NEXT: v_mul_dx9_zero_f32_e32 v0, v1, v0
@@ -60,7 +60,7 @@ define i16 @v_powi_f16(i16 %l, i32 %r) {
; GFX11-FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1
@@ -137,7 +137,7 @@ define float @v_powi_f32(float %l, i32 %r) {
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_log_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -147,7 +147,7 @@ define float @v_powi_f32(float %l, i32 %r) {
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%res = call float @llvm.powi.f32.i32(float %l, i32 %r)
@@ -224,7 +224,7 @@ define float @v_powi_neg1_f32(float %l) {
; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f32_e32 v2, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0
; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -300,7 +300,7 @@ define float @v_powi_neg2_f32(float %l) {
; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 1.0
; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
; GFX11-NEXT: v_rcp_f32_e32 v2, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2
@@ -477,7 +477,7 @@ define float @v_powi_neg128_f32(float %l) {
; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 1.0
; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
; GFX11-NEXT: v_rcp_f32_e32 v2, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
index efa51ea..1de5e13 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
@@ -930,6 +930,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg
;
; GFX1250-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
; GFX1250-UNALIGNED: ; %bb.0:
+; GFX1250-UNALIGNED-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-UNALIGNED-NEXT: global_load_b96 v[0:2], v0, s[0:1]
; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt 0x0
@@ -940,6 +941,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg
;
; GFX1250-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1:
; GFX1250-NOUNALIGNED: ; %bb.0:
+; GFX1250-NOUNALIGNED-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NOUNALIGNED-NEXT: s_clause 0xb
; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s2, s[0:1], 0x1
; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s3, s[0:1], 0x3
@@ -1208,6 +1210,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg
;
; GFX1250-UNALIGNED-LABEL: s_load_constant_v3i32_align2:
; GFX1250-UNALIGNED: ; %bb.0:
+; GFX1250-UNALIGNED-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-UNALIGNED-NEXT: global_load_b96 v[0:2], v0, s[0:1]
; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt 0x0
@@ -1218,6 +1221,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg
;
; GFX1250-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2:
; GFX1250-NOUNALIGNED: ; %bb.0:
+; GFX1250-NOUNALIGNED-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NOUNALIGNED-NEXT: s_clause 0x5
; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s2, s[0:1], 0x2
; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s3, s[0:1], 0x6
@@ -1362,6 +1366,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg
;
; GFX1250-LABEL: s_load_constant_v3i32_align4:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mov_b32 s4, s0
; GFX1250-NEXT: s_mov_b32 s5, s1
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
@@ -1413,6 +1418,7 @@ define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) {
;
; GFX1250-LABEL: s_load_constant_i96_align8:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mov_b32 s4, s0
; GFX1250-NEXT: s_mov_b32 s5, s1
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
@@ -1464,6 +1470,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg
;
; GFX1250-LABEL: s_load_constant_v3i32_align8:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mov_b32 s4, s0
; GFX1250-NEXT: s_mov_b32 s5, s1
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
@@ -1515,6 +1522,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg
;
; GFX1250-LABEL: s_load_constant_v6i16_align8:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mov_b32 s4, s0
; GFX1250-NEXT: s_mov_b32 s5, s1
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
@@ -1593,6 +1601,7 @@ define amdgpu_ps <12 x i8> @s_load_constant_v12i8_align8(ptr addrspace(4) inreg
;
; GFX1250-LABEL: s_load_constant_v12i8_align8:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mov_b32 s4, s0
; GFX1250-NEXT: s_mov_b32 s5, s1
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
@@ -1670,11 +1679,24 @@ define amdgpu_ps <12 x i8> @s_load_constant_v12i8_align8(ptr addrspace(4) inreg
}
define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align16(ptr addrspace(4) inreg %ptr) {
-; GFX12-LABEL: s_load_constant_v3i32_align16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ; return to shader part epilog
+; GFX12-UNALIGNED-LABEL: s_load_constant_v3i32_align16:
+; GFX12-UNALIGNED: ; %bb.0:
+; GFX12-UNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-UNALIGNED-NEXT: ; return to shader part epilog
+;
+; GFX12-NOUNALIGNED-LABEL: s_load_constant_v3i32_align16:
+; GFX12-NOUNALIGNED: ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: s_load_constant_v3i32_align16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
;
; GCN-LABEL: s_load_constant_v3i32_align16:
; GCN: ; %bb.0:
@@ -1684,3 +1706,5 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align16(ptr addrspace(4) inreg
%load = load <3 x i32>, ptr addrspace(4) %ptr, align 16
ret <3 x i32> %load
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant32bit.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant32bit.ll
index 0038a09..2e1b853 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant32bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant32bit.ll
@@ -11,8 +11,8 @@ define amdgpu_ps float @load_constant32bit_vgpr_offset(i32 %arg) {
; GFX6-LABEL: load_constant32bit_vgpr_offset:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: v_mov_b32_e32 v1, 0
+; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b64 s[0:1], 0
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
@@ -59,8 +59,8 @@ define amdgpu_ps <8 x float> @load_constant32bit_vgpr_v8f32(ptr addrspace(6) %ar
; GFX6-LABEL: load_constant32bit_vgpr_v8f32:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: v_mov_b32_e32 v4, v0
-; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: v_mov_b32_e32 v5, 0
+; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b64 s[0:1], 0
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll
index 4361e5c..27005e7aa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll
@@ -1070,9 +1070,6 @@ define amdgpu_ps void @load_divergent_P3_i16(ptr addrspace(3) inreg %ptra, ptr a
; GFX11-True16-NEXT: v_mov_b32_e32 v1, s0
; GFX11-True16-NEXT: ds_load_u16_d16 v1, v1
; GFX11-True16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v1
-; GFX11-True16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-True16-NEXT: v_mov_b16_e32 v1.l, s0
; GFX11-True16-NEXT: ds_store_b16 v0, v1
; GFX11-True16-NEXT: s_endpgm
;
@@ -1089,10 +1086,6 @@ define amdgpu_ps void @load_divergent_P3_i16(ptr addrspace(3) inreg %ptra, ptr a
; GFX12-True16-NEXT: v_mov_b32_e32 v1, s0
; GFX12-True16-NEXT: ds_load_u16_d16 v1, v1
; GFX12-True16-NEXT: s_wait_dscnt 0x0
-; GFX12-True16-NEXT: v_readfirstlane_b32 s0, v1
-; GFX12-True16-NEXT: s_wait_alu 0xf1ff
-; GFX12-True16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-True16-NEXT: v_mov_b16_e32 v1.l, s0
; GFX12-True16-NEXT: ds_store_b16 v0, v1
; GFX12-True16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform.ll
index bf36dea..9bf140c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform.ll
@@ -13,9 +13,6 @@ define amdgpu_ps void @load_uniform_P1_i16_gfx12(ptr addrspace(1) inreg %ptra, p
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: global_load_d16_b16 v2, v2, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_readfirstlane_b32 s0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b16_e32 v2.l, s0
; GFX11-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
;
@@ -312,9 +309,6 @@ define amdgpu_ps void @load_uniform_P4_i16_gfx12(ptr addrspace(4) inreg %ptra, p
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: global_load_d16_b16 v2, v2, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_readfirstlane_b32 s0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b16_e32 v2.l, s0
; GFX11-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-zero-and-sign-extending-uniform-in-vgpr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-zero-and-sign-extending-uniform-in-vgpr.ll
index f12ec4d..6b4008f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-zero-and-sign-extending-uniform-in-vgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-zero-and-sign-extending-uniform-in-vgpr.ll
@@ -299,7 +299,7 @@ define amdgpu_ps void @sextload_and_zextload_P3_i8(ptr addrspace(3) inreg %ptra,
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: v_readfirstlane_b32 s1, v2
; GFX12-NEXT: s_add_co_i32 s0, s0, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v1, s0
; GFX12-NEXT: ds_store_b32 v0, v1
; GFX12-NEXT: s_endpgm
@@ -338,7 +338,7 @@ define amdgpu_ps void @sextload_and_zextload_P3_i16(ptr addrspace(3) inreg %ptra
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: v_readfirstlane_b32 s1, v2
; GFX12-NEXT: s_add_co_i32 s0, s0, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v1, s0
; GFX12-NEXT: ds_store_b32 v0, v1
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
index e86f747..37b5422 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10WGP %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10CU %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11WGP %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11CU %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10WGP %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10CU %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11WGP %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11CU %s
; Note: we use MIR test checks + stop after legalizer to prevent
; tests from being optimized out.
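
These RUN lines (like mmra.ll below) now opt into -new-reg-bank-select, AMDGPU's newer GlobalISel register-bank selection path; the MIR-level checks themselves are unchanged by which bank-select implementation runs.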
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs-i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs-i64.ll
index 43c8f46..62b8b55 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs-i64.ll
@@ -132,6 +132,7 @@ define i64 @test_abs_i64(i64 %a) {
define amdgpu_ps i64 @test_umin_i64_s(i64 inreg %a, i64 inreg %b) {
; CHECK-LABEL: test_umin_i64_s:
; CHECK: ; %bb.0:
+; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; CHECK-NEXT: v_min_u64 v[0:1], s[0:1], s[2:3]
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -144,6 +145,7 @@ define amdgpu_ps i64 @test_umin_i64_s(i64 inreg %a, i64 inreg %b) {
define amdgpu_ps i64 @test_umax_i64_s(i64 inreg %a, i64 inreg %b) {
; CHECK-LABEL: test_umax_i64_s:
; CHECK: ; %bb.0:
+; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; CHECK-NEXT: v_max_u64 v[0:1], s[0:1], s[2:3]
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -156,6 +158,7 @@ define amdgpu_ps i64 @test_umax_i64_s(i64 inreg %a, i64 inreg %b) {
define amdgpu_ps i64 @test_smin_i64_s(i64 inreg %a, i64 inreg %b) {
; CHECK-LABEL: test_smin_i64_s:
; CHECK: ; %bb.0:
+; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; CHECK-NEXT: v_min_i64 v[0:1], s[0:1], s[2:3]
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -168,6 +171,7 @@ define amdgpu_ps i64 @test_smin_i64_s(i64 inreg %a, i64 inreg %b) {
define amdgpu_ps i64 @test_smax_i64_s(i64 inreg %a, i64 inreg %b) {
; CHECK-LABEL: test_smax_i64_s:
; CHECK: ; %bb.0:
+; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; CHECK-NEXT: v_max_i64 v[0:1], s[0:1], s[2:3]
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -180,6 +184,7 @@ define amdgpu_ps i64 @test_smax_i64_s(i64 inreg %a, i64 inreg %b) {
define amdgpu_ps i64 @test_abs_i64_s(i64 inreg %a) {
; CHECK-LABEL: test_abs_i64_s:
; CHECK: ; %bb.0:
+; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; CHECK-NEXT: s_ashr_i32 s2, s1, 31
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; CHECK-NEXT: s_mov_b32 s3, s2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll
index 44b12a9..61a6137 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s
declare void @readsMem(ptr) #0
declare void @writesMem(ptr) #1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
index 1cd9c0b..7bd1ff2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
@@ -8,17 +8,16 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac
; GFX10-LABEL: v_mul_i64_no_zext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[0:1]
-; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, v0, v2, 0
-; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, v0, v3, v[5:6]
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, v[5:6]
-; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[2:3]
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v2, v4, 0
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, v2, v5, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, v3, v4, v[1:2]
+; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i64_no_zext:
@@ -26,19 +25,17 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v9, 3, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b64 v[0:1], v9, s[0:1]
-; GFX11-NEXT: global_load_b64 v[2:3], v9, s[2:3]
+; GFX11-NEXT: global_load_b64 v[2:3], v8, s[0:1]
+; GFX11-NEXT: global_load_b64 v[4:5], v8, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v0, v2, 0
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v0, v3, v[5:6]
-; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v1, v2, v[6:7]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v5, v7
-; GFX11-NEXT: global_store_b64 v9, v[4:5], s[2:3]
+; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v2, v5, v[1:2]
+; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v3, v4, v[6:7]
+; GFX11-NEXT: global_store_b64 v8, v[0:1], s[2:3]
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
@@ -58,18 +55,16 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 2, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
-; GFX10-NEXT: global_load_dword v4, v3, s[6:7]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v1, s[2:3]
+; GFX10-NEXT: global_load_dword v4, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v0, v4, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v4, v[0:1]
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v2, v4, 0
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v3, v4, v[1:2]
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i64_zext_src1:
@@ -80,17 +75,16 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v1, s[2:3]
-; GFX11-NEXT: global_load_b32 v5, v2, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v1, s[2:3]
+; GFX11-NEXT: global_load_b32 v6, v0, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v5, 0
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v6, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v1, v5, v[0:1]
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1]
+; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v3, v6, v[1:2]
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, v4
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
@@ -110,18 +104,16 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v2, s[2:3]
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7]
+; GFX10-NEXT: global_load_dword v4, v1, s[2:3]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v4, v0, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1]
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v2, 0
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v4, v3, v[1:2]
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i64_zext_src0:
@@ -134,15 +126,14 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v5, v1, s[2:3]
-; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v6, v1, s[2:3]
+; GFX11-NEXT: global_load_b64 v[2:3], v0, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1]
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1]
+; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v6, v3, v[1:2]
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, v4
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.a = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid
@@ -165,10 +156,10 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v3, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v2, 0
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v2, v3, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
@@ -179,15 +170,15 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT: global_load_b32 v0, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v3, v0, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v1, v0, 0
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, 0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -209,18 +200,16 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dword v4, v2, s[2:3]
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX10-NEXT: global_load_dword v4, v0, s[2:3]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v4, v0, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1]
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v2, 0
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v4, v3, v[1:2]
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i64_masked_src0_hi:
@@ -233,15 +222,14 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b32 v5, v0, s[2:3]
-; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v6, v0, s[2:3]
+; GFX11-NEXT: global_load_b64 v[2:3], v0, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1]
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1]
+; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v6, v3, v[1:2]
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, v4
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
@@ -389,22 +377,20 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
-; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
+; GFX10-NEXT: global_load_dwordx2 v[1:2], v0, s[2:3]
+; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_and_b32_e32 v6, 0xfff00000, v0
+; GFX10-NEXT: v_and_b32_e32 v5, 0xfff00000, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mad_u64_u32 v[4:5], s2, v6, v2, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, v5
-; GFX10-NEXT: v_mad_u64_u32 v[5:6], s2, v6, v3, v[0:1]
-; GFX10-NEXT: v_and_b32_e32 v0, 0xf00f, v1
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v0, v2, v[5:6]
-; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: global_store_dwordx2 v0, v[4:5], s[0:1]
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v5, v3, 0
+; GFX10-NEXT: v_mad_u64_u32 v[4:5], s2, v5, v4, v[1:2]
+; GFX10-NEXT: v_and_b32_e32 v1, 0xf00f, v2
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v1, v3, v[4:5]
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i64_partially_masked_src0:
@@ -414,24 +400,22 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
-; GFX11-NEXT: global_load_b64 v[2:3], v2, s[4:5]
+; GFX11-NEXT: global_load_b64 v[1:2], v0, s[2:3]
+; GFX11-NEXT: global_load_b64 v[3:4], v0, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xfff00000, v0
+; GFX11-NEXT: v_and_b32_e32 v7, 0xfff00000, v1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v7, v2, 0
-; GFX11-NEXT: v_mov_b32_e32 v0, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v7, v3, v[0:1]
-; GFX11-NEXT: v_and_b32_e32 v3, 0xf00f, v1
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[5:6]
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v7, v3, 0
+; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v7, v4, v[1:2]
+; GFX11-NEXT: v_and_b32_e32 v4, 0xf00f, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v0, 0
-; GFX11-NEXT: global_store_b64 v0, v[4:5], s[0:1]
+; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v4, v3, v[5:6]
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
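
The mul-known-bits churn above is pure register renaming around one fixed expansion: a 64 x 64 -> 64 multiply built from three v_mad_u64_u32 steps, each a 32 x 32 -> 64 multiply-add. Writing a = 2^32 a_hi + a_lo and b = 2^32 b_hi + b_lo:

  ab \bmod 2^{64} = \bigl( a_{lo} b_{lo} + 2^{32} (a_{lo} b_{hi} + a_{hi} b_{lo}) \bigr) \bmod 2^{64}

The first mad forms the full 64-bit a_lo*b_lo; the other two accumulate the cross terms into its high half, which is exactly the v[1:2] operand chaining visible in the new checks.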
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 637aaf7..1462b59 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -26,6 +26,7 @@ define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
;
; GFX1250-LABEL: s_mul_i16:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mul_i32 s0, s0, s1
; GFX1250-NEXT: ; return to shader part epilog
%result = mul i16 %num, %den
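
Each new GFX1250 check block begins with the same s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1, i.e. a write of 1 into the 1-bit field at bit offset 25 of the wave MODE register; which feature that bit gates is not visible in this diff. As a reminder of the standard s_setreg operand layout (stated from the ISA convention, not taken from the patch), hwreg(id, offset, size) packs into the instruction's simm16 as:

  \text{simm16} = \text{id} + 2^{6} \cdot \text{offset} + 2^{11} \cdot (\text{size} - 1)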
@@ -125,6 +126,7 @@ define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inre
;
; GFX1250-LABEL: s_mul_i16_zeroext:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mul_i32 s0, s0, s1
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_and_b32 s0, 0xffff, s0
@@ -220,6 +222,7 @@ define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inre
;
; GFX1250-LABEL: s_mul_i16_signext:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mul_i32 s0, s0, s1
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_sext_i32_i16 s0, s0
@@ -315,6 +318,7 @@ define amdgpu_ps i32 @s_mul_i32(i32 inreg %num, i32 inreg %den) {
;
; GFX1250-LABEL: s_mul_i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mul_i32 s0, s0, s1
; GFX1250-NEXT: ; return to shader part epilog
%result = mul i32 %num, %den
@@ -375,6 +379,7 @@ define amdgpu_ps <2 x i32> @s_mul_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %d
;
; GFX1250-LABEL: s_mul_v2i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mul_i32 s0, s0, s2
; GFX1250-NEXT: s_mul_i32 s1, s1, s3
; GFX1250-NEXT: ; return to shader part epilog
@@ -474,6 +479,7 @@ define amdgpu_cs i33 @s_mul_i33(i33 inreg %num, i33 inreg %den) {
;
; GFX1250-LABEL: s_mul_i33:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3]
; GFX1250-NEXT: ; return to shader part epilog
%result = mul i33 %num, %den
@@ -535,6 +541,7 @@ define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) {
;
; GFX1250-LABEL: s_mul_i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3]
; GFX1250-NEXT: ; return to shader part epilog
%result = mul i64 %num, %den
@@ -546,10 +553,11 @@ define i64 @v_mul_i64(i64 %num, i64 %den) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v4, v0
-; GCN-NEXT: v_mov_b32_e32 v5, v1
-; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
-; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2]
-; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4]
+; GCN-NEXT: v_mov_b32_e32 v5, v2
+; GCN-NEXT: v_mov_b32_e32 v6, v1
+; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0
+; GCN-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2]
+; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i64:
@@ -711,6 +719,7 @@ define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
;
; GFX1250-LABEL: s_mul_i96:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mul_i32 s6, s0, s5
; GFX1250-NEXT: s_mul_i32 s7, s1, s4
; GFX1250-NEXT: s_mul_i32 s2, s2, s3
@@ -740,12 +749,13 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GCN-NEXT: v_mov_b32_e32 v6, v0
; GCN-NEXT: v_mov_b32_e32 v7, v1
; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0
-; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v4, v[0:1]
-; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0
-; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v3, v[8:9]
-; GCN-NEXT: v_mov_b32_e32 v2, v8
-; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v4, v[1:2]
-; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[1:2]
+; GCN-NEXT: v_mov_b32_e32 v8, v2
+; GCN-NEXT: v_mov_b32_e32 v9, v3
+; GCN-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v7, v4, v[0:1]
+; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0
+; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v9, v[10:11]
+; GCN-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v4, v[1:2]
+; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v9, v[10:11]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i96:
@@ -753,26 +763,26 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v0
; GFX10-NEXT: v_mov_b32_e32 v7, v1
+; GFX10-NEXT: v_mov_b32_e32 v8, v3
; GFX10-NEXT: v_mul_lo_u32 v0, v6, v5
-; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v7, v4, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v6, v3, 0
-; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v2, v3, v[8:9]
-; GFX10-NEXT: v_mov_b32_e32 v2, v8
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v6, v4, v[1:2]
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[9:10], s4, v7, v4, v[0:1]
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v6, v8, 0
+; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v2, v8, v[9:10]
+; GFX10-NEXT: v_mad_u64_u32 v[9:10], s4, v6, v4, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v8, v[9:10]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_mul_i96:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
+; GFX11-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_mov_b32 v9, v3
; GFX11-NEXT: v_mul_lo_u32 v0, v6, v5
-; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v7, v4, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v3, 0
-; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v2, v3, v[8:9]
-; GFX11-NEXT: v_mov_b32_e32 v2, v9
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[1:2]
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[1:2]
+; GFX11-NEXT: v_mad_u64_u32 v[10:11], null, v7, v4, v[0:1]
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v9, 0
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v8, v9, v[10:11]
+; GFX11-NEXT: v_mad_u64_u32 v[10:11], null, v6, v4, v[1:2]
+; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v9, v[10:11]
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_mul_i96:
@@ -783,16 +793,16 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_mul_lo_u32 v0, v6, v5
-; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v7, v4, v[0:1]
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v3, 0
+; GFX12-NEXT: v_mov_b32_e32 v8, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v3, v[8:9]
-; GFX12-NEXT: v_mov_b32_e32 v2, v8
+; GFX12-NEXT: v_mul_lo_u32 v0, v6, v5
+; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], null, v7, v4, v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v8, 0
+; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v2, v8, v[9:10]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v6, v4, v[1:2]
-; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[1:2]
+; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], null, v6, v4, v[1:2]
+; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v8, v[9:10]
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_mul_i96:
@@ -808,10 +818,10 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX1250-NEXT: v_mad_u32 v9, v2, v3, v5
; GFX1250-NEXT: v_mov_b32_e32 v8, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], v6, v4, v[8:9]
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], v7, v3, v[4:5]
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v6, v4, v[8:9]
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], v7, v3, v[10:11]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3
+; GFX1250-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = mul i96 %num, %den
ret i96 %result
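
The v_mul_i96 rewrites reshuffle registers but keep the same schoolbook expansion over 32-bit limbs. With a = a_2 2^{64} + a_1 2^{32} + a_0 and b likewise:

  ab \bmod 2^{96} = \bigl( a_0 b_0 + 2^{32} (a_0 b_1 + a_1 b_0) + 2^{64} (a_0 b_2 + a_1 b_1 + a_2 b_0) \bigr) \bmod 2^{96}

Only the low 32 bits of the 2^{64}-weighted partial products survive truncation, which is why the top limb is formed with plain v_mul_lo_u32 (v_mad_u32 on GFX1250) rather than a 64-bit mad.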
@@ -1025,6 +1035,7 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
;
; GFX1250-LABEL: s_mul_i128:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mul_i32 s9, s0, s6
; GFX1250-NEXT: s_mul_i32 s11, s1, s5
; GFX1250-NEXT: s_mul_hi_u32 s10, s0, s6
@@ -1071,18 +1082,19 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
; GFX7-NEXT: v_mov_b32_e32 v10, v2
-; GFX7-NEXT: v_mul_lo_u32 v7, v8, v7
-; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1]
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0
-; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12]
-; GFX7-NEXT: v_mul_lo_u32 v6, v9, v6
-; GFX7-NEXT: v_mov_b32_e32 v2, v11
-; GFX7-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2]
-; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2]
-; GFX7-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5]
-; GFX7-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc
-; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
-; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
+; GFX7-NEXT: v_mov_b32_e32 v12, v4
+; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v9, v5, v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v11, v3
+; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0
+; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v12, v[13:14]
+; GFX7-NEXT: v_mul_lo_u32 v4, v9, v6
+; GFX7-NEXT: v_mul_lo_u32 v6, v8, v7
+; GFX7-NEXT: v_mad_u64_u32 v[13:14], vcc, v8, v5, v[1:2]
+; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[13:14]
+; GFX7-NEXT: v_addc_u32_e64 v3, s[4:5], v3, v6, s[4:5]
+; GFX7-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4]
+; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i128:
@@ -1092,18 +1104,19 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
; GFX8-NEXT: v_mov_b32_e32 v10, v2
-; GFX8-NEXT: v_mul_lo_u32 v7, v8, v7
-; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0
-; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12]
-; GFX8-NEXT: v_mul_lo_u32 v6, v9, v6
-; GFX8-NEXT: v_mov_b32_e32 v2, v11
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2]
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2]
-; GFX8-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5]
-; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
+; GFX8-NEXT: v_mov_b32_e32 v12, v4
+; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v9, v5, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v11, v3
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v12, v[13:14]
+; GFX8-NEXT: v_mul_lo_u32 v4, v9, v6
+; GFX8-NEXT: v_mul_lo_u32 v6, v8, v7
+; GFX8-NEXT: v_mad_u64_u32 v[13:14], vcc, v8, v5, v[1:2]
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[13:14]
+; GFX8-NEXT: v_addc_u32_e64 v3, s[4:5], v3, v6, s[4:5]
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4]
+; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i128:
@@ -1113,18 +1126,19 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX9-NEXT: v_mov_b32_e32 v9, v1
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
; GFX9-NEXT: v_mov_b32_e32 v10, v2
-; GFX9-NEXT: v_mul_lo_u32 v7, v8, v7
-; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0
-; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12]
-; GFX9-NEXT: v_mul_lo_u32 v6, v9, v6
-; GFX9-NEXT: v_mov_b32_e32 v2, v11
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2]
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2]
-; GFX9-NEXT: v_addc_co_u32_e64 v7, s[4:5], v12, v7, s[4:5]
-; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v6, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
+; GFX9-NEXT: v_mov_b32_e32 v12, v4
+; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v9, v5, v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v11, v3
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v12, v[13:14]
+; GFX9-NEXT: v_mul_lo_u32 v4, v9, v6
+; GFX9-NEXT: v_mul_lo_u32 v6, v8, v7
+; GFX9-NEXT: v_mad_u64_u32 v[13:14], vcc, v8, v5, v[1:2]
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[13:14]
+; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], v3, v6, s[4:5]
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4]
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i128:
@@ -1133,19 +1147,19 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX10-NEXT: v_mov_b32_e32 v8, v0
; GFX10-NEXT: v_mov_b32_e32 v9, v1
; GFX10-NEXT: v_mov_b32_e32 v10, v2
+; GFX10-NEXT: v_mov_b32_e32 v11, v3
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v6, 0
; GFX10-NEXT: v_mul_lo_u32 v7, v8, v7
; GFX10-NEXT: v_mul_lo_u32 v6, v9, v6
-; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v9, v5, v[0:1]
+; GFX10-NEXT: v_mad_u64_u32 v[12:13], s4, v9, v5, v[0:1]
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v4, 0
-; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v10, v4, v[11:12]
-; GFX10-NEXT: v_mov_b32_e32 v2, v11
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[1:2]
-; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v12, v7, s4
-; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo
-; GFX10-NEXT: v_mad_u64_u32 v[5:6], s4, v10, v5, v[6:7]
-; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v4, v[5:6]
+; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v10, v4, v[12:13]
+; GFX10-NEXT: v_mad_u64_u32 v[12:13], vcc_lo, v8, v5, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[12:13]
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s4, v3, v7, s4
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX10-NEXT: v_mad_u64_u32 v[5:6], s4, v10, v5, v[3:4]
+; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v11, v4, v[5:6]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_mul_i128:
@@ -1157,11 +1171,11 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v6, 0
; GFX11-NEXT: v_mul_lo_u32 v4, v9, v6
; GFX11-NEXT: v_mul_lo_u32 v6, v8, v7
-; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v9, v5, v[0:1]
+; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v9, v5, v[0:1]
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v11, 0
-; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v10, v11, v[2:3]
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[1:2]
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v10, v11, v[13:14]
+; GFX11-NEXT: v_mad_u64_u32 v[13:14], vcc_lo, v8, v5, v[1:2]
+; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[13:14]
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, s0
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v4, vcc_lo
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v10, v5, v[3:4]
@@ -1176,28 +1190,26 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1
-; GFX12-NEXT: v_mov_b32_e32 v10, v2
+; GFX12-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v6, 0
; GFX12-NEXT: v_mul_lo_u32 v7, v8, v7
; GFX12-NEXT: v_mul_lo_u32 v6, v9, v6
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v9, v5, v[0:1]
+; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], null, v9, v5, v[0:1]
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0
-; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v10, v4, v[11:12]
+; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v10, v4, v[12:13]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v2, v11
-; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2]
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v12, v7, s0
-; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], vcc_lo, v8, v5, v[1:2]
+; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[12:13]
+; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s0
+; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, v7, v6, vcc_lo
-; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[6:7]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v3, v4, v[5:6]
+; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[3:4]
+; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v11, v4, v[5:6]
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_mul_i128:
@@ -1210,16 +1222,16 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v9, v5, v[0:1]
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v8, v4, 0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v2, v4, v[10:11]
-; GFX1250-NEXT: v_mov_b32_e32 v12, v1
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[12:13], v2, v4, v[10:11]
+; GFX1250-NEXT: v_mov_b32_e32 v10, v1
; GFX1250-NEXT: v_mul_lo_u32 v1, v9, v6
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mov_b32_e32 v13, v10
-; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], vcc_lo, v8, v5, v[12:13]
+; GFX1250-NEXT: v_mov_b32_e32 v11, v12
+; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], vcc_lo, v8, v5, v[10:11]
; GFX1250-NEXT: v_mul_lo_u32 v8, v8, v7
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[12:13]
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v11, v8, s0
+; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[14:15]
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v13, v8, s0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v8, v1, vcc_lo
; GFX1250-NEXT: v_mad_u32 v1, v2, v5, v1
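
v_mul_i128 and v_mul_i256 follow the same pattern, generalized: every 32-bit limb product a_i b_j is weighted by 2^{32(i+j)}, terms beyond the result width are dropped, and the v_addc carry chains stitch together partial products that straddle limb boundaries:

  ab \bmod 2^{32n} = \Bigl( \sum_{i+j < n} 2^{32(i+j)} \, a_i b_j \Bigr) \bmod 2^{32n}

For i + j = n - 1 only the low 32 bits of a_i b_j matter, hence the scattered v_mul_lo_u32 instructions feeding the final carry adds.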
@@ -2215,6 +2227,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
;
; GFX1250-LABEL: s_mul_i256:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mul_i32 s17, s0, s10
; GFX1250-NEXT: s_mul_i32 s19, s1, s9
; GFX1250-NEXT: s_mul_hi_u32 s18, s0, s10
@@ -2401,207 +2414,201 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
-; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
-; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0
-; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17]
-; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
-; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc
-; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
-; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
-; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21]
-; GFX7-NEXT: v_addc_u32_e32 v23, vcc, 0, v22, vcc
-; GFX7-NEXT: v_mov_b32_e32 v22, v18
-; GFX7-NEXT: v_mov_b32_e32 v18, v19
-; GFX7-NEXT: v_mov_b32_e32 v19, v16
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
-; GFX7-NEXT: v_mul_lo_u32 v16, v6, v9
-; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7]
-; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21]
-; GFX7-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v6, s[4:5]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19]
-; GFX7-NEXT: v_mul_lo_u32 v26, v4, v11
-; GFX7-NEXT: v_mul_lo_u32 v27, v3, v12
-; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19]
-; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22]
-; GFX7-NEXT: v_mul_lo_u32 v25, v5, v10
-; GFX7-NEXT: v_mul_lo_u32 v28, v2, v13
-; GFX7-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12]
-; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
-; GFX7-NEXT: v_addc_u32_e64 v22, s[10:11], 0, v6, s[10:11]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19]
-; GFX7-NEXT: v_mov_b32_e32 v21, v20
-; GFX7-NEXT: v_mov_b32_e32 v20, v11
-; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
-; GFX7-NEXT: v_addc_u32_e64 v2, s[10:11], 0, v22, s[10:11]
-; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
-; GFX7-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v2, s[10:11]
-; GFX7-NEXT: v_mul_lo_u32 v9, v1, v14
-; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17]
-; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21]
-; GFX7-NEXT: v_addc_u32_e64 v3, s[10:11], v12, v3, s[10:11]
+; GFX7-NEXT: v_mul_lo_u32 v28, v4, v11
+; GFX7-NEXT: v_mul_lo_u32 v29, v3, v12
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17]
+; GFX7-NEXT: v_mul_lo_u32 v30, v2, v13
+; GFX7-NEXT: v_mul_lo_u32 v27, v5, v10
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[18:19]
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v3, v11, v[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v10, 0
+; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v4, v10, v[18:19]
+; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v1, v9, v[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[20:21]
+; GFX7-NEXT: v_cndmask_b32_e64 v26, 0, 1, s[4:5]
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[22:23]
+; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v6, v8, v[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0
+; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v1, v11, v[16:17]
+; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v10, v[20:21]
+; GFX7-NEXT: v_addc_u32_e64 v20, s[4:5], 0, v24, s[4:5]
+; GFX7-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v3, v9, v[16:17]
+; GFX7-NEXT: v_addc_u32_e64 v16, s[4:5], 0, v20, s[4:5]
+; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v4, v8, v[24:25]
+; GFX7-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v16, s[4:5]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v13, v[21:22]
+; GFX7-NEXT: v_mul_lo_u32 v25, v6, v9
+; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v1, v12, v[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v2, v11, v[21:22]
+; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v3, v10, v[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[14:15], v0, v11, v[19:20]
+; GFX7-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[21:22]
+; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v26, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v1, v10, v[16:17]
+; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
+; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[10:11], vcc, v2, v9, v[19:20]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[16:17], v0, v8, 0
+; GFX7-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v8, v[10:11]
+; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[14:15], v5, v8, v[12:13]
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v2, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[16:17], v0, v9, v[17:18]
+; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[16:17]
; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15
-; GFX7-NEXT: v_addc_u32_e64 v4, s[10:11], v24, v4, s[10:11]
-; GFX7-NEXT: v_addc_u32_e64 v5, s[10:11], v11, v5, s[10:11]
-; GFX7-NEXT: v_addc_u32_e64 v6, s[10:11], v23, v6, s[10:11]
-; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v17, v0, s[10:11]
-; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15]
-; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v28, s[12:13]
-; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v27, s[8:9]
-; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7]
-; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5]
-; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc
-; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
-; GFX7-NEXT: v_mov_b32_e32 v0, v10
+; GFX7-NEXT: v_mad_u64_u32 v[11:12], vcc, v1, v8, v[2:3]
+; GFX7-NEXT: v_addc_u32_e32 v3, vcc, v6, v19, vcc
+; GFX7-NEXT: v_mul_lo_u32 v10, v1, v14
+; GFX7-NEXT: v_addc_u32_e32 v4, vcc, v4, v20, vcc
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, v5, v21, vcc
+; GFX7-NEXT: v_addc_u32_e32 v6, vcc, v24, v22, vcc
+; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v23, v0, vcc
+; GFX7-NEXT: v_addc_u32_e64 v0, vcc, v0, v10, s[14:15]
+; GFX7-NEXT: v_addc_u32_e64 v0, vcc, v0, v30, s[12:13]
+; GFX7-NEXT: v_addc_u32_e64 v0, vcc, v0, v29, s[10:11]
+; GFX7-NEXT: v_addc_u32_e64 v0, vcc, v0, v28, s[8:9]
+; GFX7-NEXT: v_addc_u32_e64 v0, vcc, v0, v27, s[6:7]
+; GFX7-NEXT: v_addc_u32_e64 v0, vcc, v0, v25, s[4:5]
+; GFX7-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, v16
+; GFX7-NEXT: v_mov_b32_e32 v1, v11
+; GFX7-NEXT: v_mov_b32_e32 v2, v12
+; GFX7-NEXT: v_mov_b32_e32 v7, v9
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i256:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
-; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
-; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0
-; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17]
-; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
-; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
-; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
-; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21]
-; GFX8-NEXT: v_addc_u32_e32 v23, vcc, 0, v22, vcc
-; GFX8-NEXT: v_mov_b32_e32 v22, v18
-; GFX8-NEXT: v_mov_b32_e32 v18, v19
-; GFX8-NEXT: v_mov_b32_e32 v19, v16
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
-; GFX8-NEXT: v_mul_lo_u32 v16, v6, v9
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7]
-; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21]
-; GFX8-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v6, s[4:5]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19]
-; GFX8-NEXT: v_mul_lo_u32 v26, v4, v11
-; GFX8-NEXT: v_mul_lo_u32 v27, v3, v12
-; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19]
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22]
-; GFX8-NEXT: v_mul_lo_u32 v25, v5, v10
-; GFX8-NEXT: v_mul_lo_u32 v28, v2, v13
-; GFX8-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12]
-; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
-; GFX8-NEXT: v_addc_u32_e64 v22, s[10:11], 0, v6, s[10:11]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19]
-; GFX8-NEXT: v_mov_b32_e32 v21, v20
-; GFX8-NEXT: v_mov_b32_e32 v20, v11
-; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
-; GFX8-NEXT: v_addc_u32_e64 v2, s[10:11], 0, v22, s[10:11]
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
-; GFX8-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v2, s[10:11]
-; GFX8-NEXT: v_mul_lo_u32 v9, v1, v14
-; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17]
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21]
-; GFX8-NEXT: v_addc_u32_e64 v3, s[10:11], v12, v3, s[10:11]
+; GFX8-NEXT: v_mul_lo_u32 v28, v4, v11
+; GFX8-NEXT: v_mul_lo_u32 v29, v3, v12
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17]
+; GFX8-NEXT: v_mul_lo_u32 v30, v2, v13
+; GFX8-NEXT: v_mul_lo_u32 v27, v5, v10
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[18:19]
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v3, v11, v[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v10, 0
+; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v4, v10, v[18:19]
+; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v1, v9, v[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[20:21]
+; GFX8-NEXT: v_cndmask_b32_e64 v26, 0, 1, s[4:5]
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[22:23]
+; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v6, v8, v[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0
+; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v1, v11, v[16:17]
+; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v10, v[20:21]
+; GFX8-NEXT: v_addc_u32_e64 v20, s[4:5], 0, v24, s[4:5]
+; GFX8-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v3, v9, v[16:17]
+; GFX8-NEXT: v_addc_u32_e64 v16, s[4:5], 0, v20, s[4:5]
+; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v4, v8, v[24:25]
+; GFX8-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v16, s[4:5]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v13, v[21:22]
+; GFX8-NEXT: v_mul_lo_u32 v25, v6, v9
+; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v1, v12, v[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v2, v11, v[21:22]
+; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v3, v10, v[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[14:15], v0, v11, v[19:20]
+; GFX8-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[21:22]
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v26, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v1, v10, v[16:17]
+; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
+; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[10:11], vcc, v2, v9, v[19:20]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[16:17], v0, v8, 0
+; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v8, v[10:11]
+; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[14:15], v5, v8, v[12:13]
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v2, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[16:17], v0, v9, v[17:18]
+; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[16:17]
; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15
-; GFX8-NEXT: v_addc_u32_e64 v4, s[10:11], v24, v4, s[10:11]
-; GFX8-NEXT: v_addc_u32_e64 v5, s[10:11], v11, v5, s[10:11]
-; GFX8-NEXT: v_addc_u32_e64 v6, s[10:11], v23, v6, s[10:11]
-; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v17, v0, s[10:11]
-; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15]
-; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v28, s[12:13]
-; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v27, s[8:9]
-; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7]
-; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5]
-; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, v10
+; GFX8-NEXT: v_mad_u64_u32 v[11:12], vcc, v1, v8, v[2:3]
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v6, v19, vcc
+; GFX8-NEXT: v_mul_lo_u32 v10, v1, v14
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v20, vcc
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v21, vcc
+; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v24, v22, vcc
+; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v23, v0, vcc
+; GFX8-NEXT: v_addc_u32_e64 v0, vcc, v0, v10, s[14:15]
+; GFX8-NEXT: v_addc_u32_e64 v0, vcc, v0, v30, s[12:13]
+; GFX8-NEXT: v_addc_u32_e64 v0, vcc, v0, v29, s[10:11]
+; GFX8-NEXT: v_addc_u32_e64 v0, vcc, v0, v28, s[8:9]
+; GFX8-NEXT: v_addc_u32_e64 v0, vcc, v0, v27, s[6:7]
+; GFX8-NEXT: v_addc_u32_e64 v0, vcc, v0, v25, s[4:5]
+; GFX8-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, v16
+; GFX8-NEXT: v_mov_b32_e32 v1, v11
+; GFX8-NEXT: v_mov_b32_e32 v2, v12
+; GFX8-NEXT: v_mov_b32_e32 v7, v9
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i256:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
-; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
-; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0
-; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17]
-; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
-; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v22, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
-; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v22, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
-; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21]
-; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v22, vcc
-; GFX9-NEXT: v_mov_b32_e32 v22, v18
-; GFX9-NEXT: v_mov_b32_e32 v18, v19
-; GFX9-NEXT: v_mov_b32_e32 v19, v16
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
-; GFX9-NEXT: v_mul_lo_u32 v16, v6, v9
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7]
-; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21]
-; GFX9-NEXT: v_addc_co_u32_e64 v24, s[4:5], 0, v6, s[4:5]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19]
-; GFX9-NEXT: v_mul_lo_u32 v26, v4, v11
-; GFX9-NEXT: v_mul_lo_u32 v27, v3, v12
-; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19]
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22]
-; GFX9-NEXT: v_mul_lo_u32 v25, v5, v10
-; GFX9-NEXT: v_mul_lo_u32 v28, v2, v13
-; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12]
-; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
-; GFX9-NEXT: v_addc_co_u32_e64 v22, s[10:11], 0, v6, s[10:11]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19]
-; GFX9-NEXT: v_mov_b32_e32 v21, v20
-; GFX9-NEXT: v_mov_b32_e32 v20, v11
-; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
-; GFX9-NEXT: v_addc_co_u32_e64 v2, s[10:11], 0, v22, s[10:11]
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
-; GFX9-NEXT: v_addc_co_u32_e64 v11, s[10:11], 0, v2, s[10:11]
-; GFX9-NEXT: v_mul_lo_u32 v9, v1, v14
-; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17]
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21]
-; GFX9-NEXT: v_addc_co_u32_e64 v3, s[10:11], v12, v3, s[10:11]
+; GFX9-NEXT: v_mul_lo_u32 v28, v4, v11
+; GFX9-NEXT: v_mul_lo_u32 v29, v3, v12
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17]
+; GFX9-NEXT: v_mul_lo_u32 v30, v2, v13
+; GFX9-NEXT: v_mul_lo_u32 v27, v5, v10
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[18:19]
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v3, v11, v[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v10, 0
+; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v4, v10, v[18:19]
+; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v1, v9, v[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[20:21]
+; GFX9-NEXT: v_cndmask_b32_e64 v26, 0, 1, s[4:5]
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[22:23]
+; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v6, v8, v[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0
+; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v1, v11, v[16:17]
+; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v10, v[20:21]
+; GFX9-NEXT: v_addc_co_u32_e64 v20, s[4:5], 0, v24, s[4:5]
+; GFX9-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v3, v9, v[16:17]
+; GFX9-NEXT: v_addc_co_u32_e64 v16, s[4:5], 0, v20, s[4:5]
+; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v4, v8, v[24:25]
+; GFX9-NEXT: v_addc_co_u32_e64 v24, s[4:5], 0, v16, s[4:5]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v13, v[21:22]
+; GFX9-NEXT: v_mul_lo_u32 v25, v6, v9
+; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v1, v12, v[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v2, v11, v[21:22]
+; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v3, v10, v[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[14:15], v0, v11, v[19:20]
+; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[21:22]
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v26, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v1, v10, v[16:17]
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[10:11], vcc, v2, v9, v[19:20]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[16:17], v0, v8, 0
+; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v6, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v8, v[10:11]
+; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[14:15], v5, v8, v[12:13]
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v2, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[16:17], v0, v9, v[17:18]
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[16:17]
; GFX9-NEXT: v_mul_lo_u32 v0, v0, v15
-; GFX9-NEXT: v_addc_co_u32_e64 v4, s[10:11], v24, v4, s[10:11]
-; GFX9-NEXT: v_addc_co_u32_e64 v5, s[10:11], v11, v5, s[10:11]
-; GFX9-NEXT: v_addc_co_u32_e64 v6, s[10:11], v23, v6, s[10:11]
-; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v17, v0, s[10:11]
-; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v9, s[14:15]
-; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v28, s[12:13]
-; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v27, s[8:9]
-; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v26, s[6:7]
-; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v25, s[4:5]
-; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v16, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, v10
+; GFX9-NEXT: v_mad_u64_u32 v[11:12], vcc, v1, v8, v[2:3]
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v19, vcc
+; GFX9-NEXT: v_mul_lo_u32 v10, v1, v14
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v20, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v21, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v24, v22, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v23, v0, vcc
+; GFX9-NEXT: v_addc_co_u32_e64 v0, vcc, v0, v10, s[14:15]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, vcc, v0, v30, s[12:13]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, vcc, v0, v29, s[10:11]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, vcc, v0, v28, s[8:9]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, vcc, v0, v27, s[6:7]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, vcc, v0, v25, s[4:5]
+; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, v16
+; GFX9-NEXT: v_mov_b32_e32 v1, v11
+; GFX9-NEXT: v_mov_b32_e32 v2, v12
+; GFX9-NEXT: v_mov_b32_e32 v7, v9
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i256:
@@ -2609,139 +2616,138 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v16, v0
; GFX10-NEXT: v_mov_b32_e32 v17, v1
-; GFX10-NEXT: v_mul_lo_u32 v27, v6, v9
-; GFX10-NEXT: v_mul_lo_u32 v28, v5, v10
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v14, 0
-; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v12, 0
-; GFX10-NEXT: v_mul_lo_u32 v30, v17, v14
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v13, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v12, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[18:19]
-; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s4
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
-; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
-; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v16, v10, 0
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v10, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
-; GFX10-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v5, v9, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
-; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo
-; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v6, v8, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[20:21]
-; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
-; GFX10-NEXT: v_mov_b32_e32 v20, v22
-; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
-; GFX10-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[19:20]
-; GFX10-NEXT: v_mov_b32_e32 v20, v18
-; GFX10-NEXT: v_mov_b32_e32 v19, v22
-; GFX10-NEXT: v_mul_lo_u32 v22, v16, v15
-; GFX10-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[19:20]
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v16, v8, 0
-; GFX10-NEXT: v_mul_lo_u32 v20, v4, v11
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6
-; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[24:25]
-; GFX10-NEXT: v_mul_lo_u32 v25, v3, v12
-; GFX10-NEXT: v_mad_u64_u32 v[11:12], s6, v17, v10, v[14:15]
-; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6
-; GFX10-NEXT: v_mul_lo_u32 v24, v2, v13
-; GFX10-NEXT: v_mad_u64_u32 v[18:19], s7, v3, v10, v[18:19]
-; GFX10-NEXT: v_mov_b32_e32 v13, v1
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v2, v9, v[11:12]
-; GFX10-NEXT: v_mov_b32_e32 v14, v21
-; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6
-; GFX10-NEXT: v_mad_u64_u32 v[10:11], s6, v4, v9, v[18:19]
-; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v16, v9, v[13:14]
+; GFX10-NEXT: v_mov_b32_e32 v18, v2
+; GFX10-NEXT: v_mov_b32_e32 v19, v3
+; GFX10-NEXT: v_mov_b32_e32 v20, v4
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v16, v14, 0
+; GFX10-NEXT: v_mov_b32_e32 v21, v5
+; GFX10-NEXT: v_mov_b32_e32 v0, v6
+; GFX10-NEXT: v_mov_b32_e32 v22, v7
+; GFX10-NEXT: v_mul_lo_u32 v31, v17, v14
+; GFX10-NEXT: v_mul_lo_u32 v29, v20, v11
+; GFX10-NEXT: v_mul_lo_u32 v30, v16, v15
+; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v17, v13, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v16, v12, 0
+; GFX10-NEXT: v_mul_lo_u32 v27, v0, v9
+; GFX10-NEXT: v_mad_u64_u32 v[5:6], s4, v18, v12, v[3:4]
+; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v17, v11, v[1:2]
+; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s4
+; GFX10-NEXT: v_mad_u64_u32 v[25:26], s4, v16, v10, 0
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s5, v19, v11, v[5:6]
+; GFX10-NEXT: v_mad_u64_u32 v[5:6], vcc_lo, v18, v10, v[3:4]
+; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
+; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v20, v10, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v19, v9, v[5:6]
+; GFX10-NEXT: v_mad_u64_u32 v[23:24], s4, v21, v9, v[3:4]
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v7, vcc_lo
+; GFX10-NEXT: v_mad_u64_u32 v[4:5], vcc_lo, v20, v8, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[6:7], s4, v0, v8, v[23:24]
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[25:26]
+; GFX10-NEXT: v_add_co_ci_u32_e32 v25, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4
+; GFX10-NEXT: v_mul_lo_u32 v26, v21, v10
+; GFX10-NEXT: v_mad_u64_u32 v[23:24], vcc_lo, v16, v13, v[5:6]
+; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v18, v8, v[0:1]
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s6, v16, v8, 0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v28, s4, 0, v28, s4
+; GFX10-NEXT: v_mad_u64_u32 v[5:6], s5, v17, v12, v[23:24]
+; GFX10-NEXT: v_mad_u64_u32 v[23:24], s6, v16, v11, v[3:4]
+; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s6
+; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v18, v11, v[5:6]
+; GFX10-NEXT: v_mad_u64_u32 v[5:6], s6, v17, v10, v[23:24]
+; GFX10-NEXT: v_mul_lo_u32 v23, v19, v12
+; GFX10-NEXT: v_mul_lo_u32 v24, v18, v13
+; GFX10-NEXT: v_mad_u64_u32 v[11:12], s7, v19, v10, v[3:4]
+; GFX10-NEXT: v_add_co_ci_u32_e64 v10, s6, 0, v14, s6
+; GFX10-NEXT: v_mad_u64_u32 v[3:4], s6, v18, v9, v[5:6]
+; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s6, 0, v10, s6
+; GFX10-NEXT: v_mad_u64_u32 v[5:6], s6, v20, v9, v[11:12]
+; GFX10-NEXT: v_mad_u64_u32 v[10:11], s8, v16, v9, v[1:2]
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s8
-; GFX10-NEXT: v_mad_u64_u32 v[3:4], s8, v3, v8, v[1:2]
-; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s8, 0, v6, s8
-; GFX10-NEXT: v_mad_u64_u32 v[5:6], s8, v5, v8, v[10:11]
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[12:13]
-; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v9, v3, s9
-; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v29, v4, s9
-; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v14, v5, s9
-; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v6, s9
-; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v23, v22, s9
-; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v30, s8
-; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v24, s6
-; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v25, s7
-; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v20, s5
-; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
-; GFX10-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v27, s4
-; GFX10-NEXT: v_mad_u64_u32 v[7:8], s4, v7, v8, v[9:10]
+; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v19, v8, v[3:4]
+; GFX10-NEXT: v_add_co_ci_u32_e64 v16, s8, 0, v14, s8
+; GFX10-NEXT: v_mad_u64_u32 v[14:15], s8, v21, v8, v[5:6]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[10:11]
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v9, v12, s9
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v28, v13, s9
+; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v16, v14, s9
+; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v25, v15, s9
+; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s9, v7, v30, s9
+; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s8, v7, v31, s8
+; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s6, v7, v24, s6
+; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s6, v7, v23, s7
+; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v7, v29, s4
+; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v7, v26, s5
+; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v27, vcc_lo
+; GFX10-NEXT: v_mad_u64_u32 v[7:8], s4, v22, v8, v[7:8]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_mul_i256:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
-; GFX11-NEXT: v_dual_mov_b32 v18, v8 :: v_dual_mov_b32 v19, v7
-; GFX11-NEXT: v_mul_lo_u32 v30, v4, v11
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v14, 0
-; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v16, v12, 0
-; GFX11-NEXT: v_mul_lo_u32 v29, v17, v14
-; GFX11-NEXT: v_mul_lo_u32 v28, v5, v10
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v17, v13, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[7:8], s0, v17, v11, v[7:8]
-; GFX11-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v12, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v2, v10, v[7:8]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
-; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v16, v10, 0
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v3, v9, v[7:8]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v10, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v4, v18, v[7:8]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v24, vcc_lo
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[22:23], null, v6, v18, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[20:21]
-; GFX11-NEXT: v_mov_b32_e32 v20, v8
-; GFX11-NEXT: v_cndmask_b32_e64 v26, 0, 1, s0
-; GFX11-NEXT: v_mov_b32_e32 v21, v22
-; GFX11-NEXT: v_mul_lo_u32 v22, v6, v9
-; GFX11-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v2, v18, v[0:1]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[20:21]
-; GFX11-NEXT: v_mov_b32_e32 v6, v25
-; GFX11-NEXT: v_mul_lo_u32 v25, v16, v15
-; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v17, v12, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v16, v11, v[6:7]
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v18, 0
+; GFX11-NEXT: v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v19, v3
+; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v21, v5
+; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v16, v14, 0
+; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v23, v7
+; GFX11-NEXT: v_mov_b32_e32 v22, v8
+; GFX11-NEXT: v_mad_u64_u32 v[26:27], null, v16, v10, 0
+; GFX11-NEXT: v_mul_lo_u32 v28, v0, v9
+; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v17, v13, v[1:2]
+; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v16, v12, 0
+; GFX11-NEXT: v_mul_lo_u32 v30, v20, v11
+; GFX11-NEXT: v_mul_lo_u32 v15, v16, v15
+; GFX11-NEXT: v_mul_lo_u32 v14, v17, v14
+; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v18, v12, v[3:4]
+; GFX11-NEXT: v_mad_u64_u32 v[3:4], s0, v17, v11, v[1:2]
+; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0
+; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v19, v11, v[5:6]
+; GFX11-NEXT: v_mad_u64_u32 v[5:6], vcc_lo, v18, v10, v[3:4]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v20, v10, v[1:2]
+; GFX11-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v19, v9, v[5:6]
+; GFX11-NEXT: v_mad_u64_u32 v[24:25], null, v21, v9, v[3:4]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v7, vcc_lo
+; GFX11-NEXT: v_mad_u64_u32 v[4:5], vcc_lo, v20, v22, v[1:2]
+; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v0, v22, v[24:25]
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[26:27]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v3, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0
+; GFX11-NEXT: v_mul_lo_u32 v27, v21, v10
+; GFX11-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v16, v13, v[5:6]
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], s0, v18, v22, v[0:1]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v8, s0
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v22, 0
+; GFX11-NEXT: v_mad_u64_u32 v[5:6], s1, v17, v12, v[24:25]
+; GFX11-NEXT: v_mad_u64_u32 v[24:25], s2, v16, v11, v[3:4]
; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s2
-; GFX11-NEXT: v_mad_u64_u32 v[14:15], s1, v2, v11, v[20:21]
-; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v17, v10, v[6:7]
-; GFX11-NEXT: v_mul_lo_u32 v20, v2, v13
+; GFX11-NEXT: v_mad_u64_u32 v[3:4], s0, v18, v11, v[5:6]
+; GFX11-NEXT: v_mad_u64_u32 v[5:6], s2, v17, v10, v[24:25]
+; GFX11-NEXT: v_mul_lo_u32 v24, v19, v12
; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v8, s2
-; GFX11-NEXT: v_mov_b32_e32 v11, v1
-; GFX11-NEXT: v_mad_u64_u32 v[13:14], s3, v3, v10, v[14:15]
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v2, v9, v[6:7]
-; GFX11-NEXT: v_mul_lo_u32 v21, v3, v12
-; GFX11-NEXT: v_mov_b32_e32 v12, v24
-; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v8, s2
-; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v4, v9, v[13:14]
-; GFX11-NEXT: v_mad_u64_u32 v[8:9], s4, v16, v9, v[11:12]
-; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4
-; GFX11-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v18, v[1:2]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v10, s4
-; GFX11-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v18, v[6:7]
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v18, v[8:9]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v11, v3, s5
-; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v26, v4, s5
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v10, v5, s5
-; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v27, v6, s5
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v23, v25, s5
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v29, s4
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v20, s2
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v21, s3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v30, s1
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v28, vcc_lo
-; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v22, s0
-; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v19, v18, v[9:10]
+; GFX11-NEXT: v_mul_lo_u32 v25, v18, v13
+; GFX11-NEXT: v_mad_u64_u32 v[11:12], s3, v19, v10, v[3:4]
+; GFX11-NEXT: v_mad_u64_u32 v[3:4], s2, v18, v9, v[5:6]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v8, s2
+; GFX11-NEXT: v_mad_u64_u32 v[5:6], s2, v20, v9, v[11:12]
+; GFX11-NEXT: v_mad_u64_u32 v[10:11], s4, v16, v9, v[1:2]
+; GFX11-NEXT: v_cndmask_b32_e64 v16, 0, 1, s4
+; GFX11-NEXT: v_mad_u64_u32 v[8:9], s4, v19, v22, v[3:4]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v13, s4
+; GFX11-NEXT: v_mad_u64_u32 v[12:13], s4, v21, v22, v[5:6]
+; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v22, v[10:11]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v16, v8, s5
+; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v9, s5
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v18, v12, s5
+; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v13, s5
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v15, s5
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v14, s4
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v25, s2
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v24, s3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v30, s0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v27, s1
+; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v28, vcc_lo
+; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v23, v22, v[9:10]
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_mul_i256:
@@ -2752,102 +2758,100 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
-; GFX12-NEXT: v_mul_lo_u32 v27, v6, v9
-; GFX12-NEXT: v_mul_lo_u32 v28, v5, v10
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v14, 0
-; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v12, 0
-; GFX12-NEXT: v_mul_lo_u32 v30, v17, v14
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19]
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
-; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v16, v10, 0
+; GFX12-NEXT: v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v19, v3
+; GFX12-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v21, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v16, v14, 0
+; GFX12-NEXT: v_mov_b32_e32 v0, v6
+; GFX12-NEXT: v_mov_b32_e32 v22, v7
+; GFX12-NEXT: v_mad_co_u64_u32 v[25:26], null, v16, v10, 0
+; GFX12-NEXT: v_mul_lo_u32 v31, v17, v14
+; GFX12-NEXT: v_mul_lo_u32 v27, v0, v9
+; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v17, v13, v[1:2]
+; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v16, v12, 0
+; GFX12-NEXT: v_mul_lo_u32 v29, v20, v11
+; GFX12-NEXT: v_mul_lo_u32 v30, v16, v15
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1]
-; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
+; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v18, v12, v[3:4]
+; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s0, v17, v11, v[1:2]
+; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1]
-; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
-; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v19, v11, v[5:6]
+; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], vcc_lo, v18, v10, v[3:4]
+; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1]
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21]
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0
-; GFX12-NEXT: v_mov_b32_e32 v20, v22
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v25, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v20, v10, v[1:2]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v19, v9, v[5:6]
+; GFX12-NEXT: v_mad_co_u64_u32 v[23:24], null, v21, v9, v[3:4]
+; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v7, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20]
-; GFX12-NEXT: v_mov_b32_e32 v19, v22
-; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1]
+; GFX12-NEXT: v_mad_co_u64_u32 v[4:5], vcc_lo, v20, v8, v[1:2]
+; GFX12-NEXT: v_mad_co_u64_u32 v[6:7], null, v0, v8, v[23:24]
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[25:26]
+; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v3, vcc_lo
+; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12-NEXT: v_cndmask_b32_e64 v28, 0, 1, s0
+; GFX12-NEXT: v_mul_lo_u32 v26, v21, v10
+; GFX12-NEXT: v_mad_co_u64_u32 v[23:24], vcc_lo, v16, v13, v[5:6]
+; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], s0, v18, v8, v[0:1]
+; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v28, null, 0, v28, s0
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0
-; GFX12-NEXT: v_mov_b32_e32 v20, v18
-; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[19:20]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s1, v17, v12, v[23:24]
+; GFX12-NEXT: v_mad_co_u64_u32 v[23:24], s2, v16, v11, v[3:4]
+; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12-NEXT: v_cndmask_b32_e64 v14, 0, 1, s2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s0, v18, v11, v[5:6]
+; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s2, v17, v10, v[23:24]
+; GFX12-NEXT: v_mul_lo_u32 v23, v19, v12
+; GFX12-NEXT: v_mul_lo_u32 v24, v18, v13
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s3, v19, v10, v[3:4]
+; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v14, s2
+; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s2, v18, v9, v[5:6]
+; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v14, null, 0, v10, s2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[24:25]
-; GFX12-NEXT: v_mul_lo_u32 v20, v4, v11
-; GFX12-NEXT: v_mul_lo_u32 v25, v3, v12
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2
-; GFX12-NEXT: v_mul_lo_u32 v24, v2, v13
-; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s2, v17, v10, v[14:15]
-; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s3, v3, v10, v[18:19]
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2
-; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v21
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12]
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2
-; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s2, v4, v9, v[18:19]
-; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v16, v9, v[13:14]
-; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s2, v20, v9, v[11:12]
+; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s4, v16, v9, v[1:2]
+; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4
-; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s4, v3, v8, v[1:2]
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v14, null, 0, v6, s4
-; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11]
-; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13]
-; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v19, v8, v[3:4]
+; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v16, null, 0, v14, s4
+; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s4, v21, v8, v[5:6]
+; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[10:11]
+; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v14, v5, s5
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v6, s5
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v23, v22, s5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s4
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, s2
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v12, s5
+; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v28, v13, s5
+; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v16, v14, s5
+; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v25, v15, s5
+; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v30, s5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v25, s3
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v20, s1
-; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v31, s4
+; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v24, s2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, vcc_lo
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v27, s0
+; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v23, s3
+; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v29, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v26, s1
+; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v27, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10]
+; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v22, v8, v[7:8]
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_mul_i256:
@@ -2855,87 +2859,89 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
-; GFX1250-NEXT: v_mul_lo_u32 v27, v5, v10
-; GFX1250-NEXT: v_mul_lo_u32 v29, v3, v12
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mul_lo_u32 v30, v4, v11
+; GFX1250-NEXT: v_mul_lo_u32 v29, v5, v10
+; GFX1250-NEXT: v_mul_lo_u32 v31, v3, v12
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v14, 0
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v12, 0
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v17, v13, v[0:1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19]
-; GFX1250-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v2, v12, v[0:1]
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v16, v10, 0
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[0:1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
+; GFX1250-NEXT: v_mul_lo_u32 v32, v2, v13
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v17, v13, v[0:1]
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v12, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v2, v12, v[18:19]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[20:21]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19]
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v4, v10, v[0:1]
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v4, v10, v[0:1]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v5, v9, v[0:1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[0:1]
-; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21]
-; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22
-; GFX1250-NEXT: v_mul_lo_u32 v22, v6, v9
-; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v5, v9, v[18:19]
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v10, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1]
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v28, null, 0, v26, vcc_lo
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[0:1]
-; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[20:21]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[24:25], v6, v8, v[20:21]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[18:19]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_dual_mov_b32 v18, v23 :: v_dual_mov_b32 v19, v24
+; GFX1250-NEXT: v_mul_lo_u32 v24, v6, v9
+; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[18:19]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v2, v8, v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s0
; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v12, v[20:21]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_dual_mov_b32 v20, v25 :: v_dual_mov_b32 v21, v18
-; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v16, v11, v[20:21]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_cndmask_b32_e64 v28, 0, 1, s2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22
+; GFX1250-NEXT: v_mov_b32_e32 v13, v18
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], s2, v16, v11, v[20:21]
; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[0:1]
+; GFX1250-NEXT: v_cndmask_b32_e64 v11, 0, 1, s2
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v8, 0
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v17, v10, v[18:19]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21]
-; GFX1250-NEXT: v_mul_lo_u32 v20, v2, v13
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v28, s2
-; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19]
-; GFX1250-NEXT: v_dual_mov_b32 v18, v1 :: v_dual_mov_b32 v19, v24
-; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v21, s2
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v16, v9, v[18:19]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[26:27], s2, v17, v10, v[22:23]
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v33, null, 0, v11, s2
+; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], s3, v3, v10, v[20:21]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_mov_b32_e32 v12, v1
+; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s2, v2, v9, v[26:27]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s6, v16, v9, v[12:13]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s4, v4, v9, v[22:23]
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v33, s2
; GFX1250-NEXT: v_mul_lo_u32 v2, v16, v15
-; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13]
-; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
; GFX1250-NEXT: v_mul_lo_u32 v9, v17, v14
-; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[10:11]
+; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s2
-; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[18:19]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[18:19]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[20:21]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2
; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v6, v13, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v1, v10, s2
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v26, v11, s2
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v28, v11, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v23, v2, s2
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v25, v2, s2
; GFX1250-NEXT: v_mov_b32_e32 v2, v15
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v20, s4
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29, s3
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v32, s4
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v31, s3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v25, s1
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v27, s0
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v30, s1
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29, s0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v22, vcc_lo
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v24, vcc_lo
; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v1
; GFX1250-NEXT: v_mov_b32_e32 v1, v14
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -2949,60 +2955,61 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b64 s[0:1], 0
-; GFX7-NEXT: buffer_load_dword v2, v[2:3], s[0:3], 0 addr64
-; GFX7-NEXT: v_mov_b32_e32 v3, 0x50
+; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64
+; GFX7-NEXT: v_mov_b32_e32 v5, 0x50
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v3, 0
+; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_mul_u64_zext_with_vregs:
; GFX8: ; %bb.0:
-; GFX8-NEXT: flat_load_dword v2, v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v3, 0x50
+; GFX8-NEXT: flat_load_dword v4, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x50
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_u64_zext_with_vregs:
; GFX9: ; %bb.0:
-; GFX9-NEXT: global_load_dword v2, v[2:3], off
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x50
+; GFX9-NEXT: global_load_dword v4, v[2:3], off
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x50
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_mul_u64_zext_with_vregs:
; GFX10: ; %bb.0:
-; GFX10-NEXT: global_load_dword v2, v[2:3], off
+; GFX10-NEXT: global_load_dword v4, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v2, 0
+; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v4, 0
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_mul_u64_zext_with_vregs:
; GFX11: ; %bb.0:
-; GFX11-NEXT: global_load_b32 v2, v[2:3], off
+; GFX11-NEXT: global_load_b32 v4, v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v2, 0
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v4, 0
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_mul_u64_zext_with_vregs:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_load_b32 v2, v[2:3], off
+; GFX12-NEXT: global_load_b32 v4, v[2:3], off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0
+; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v4, 0
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT: s_endpgm
;
; GFX1250-LABEL: s_mul_u64_zext_with_vregs:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: global_load_b32 v2, v[2:3], off
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: global_load_b32 v4, v[2:3], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v2, 0
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v4, 0
; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX1250-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %in, align 4
@@ -3104,6 +3111,7 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr
;
; GFX1250-LABEL: s_mul_u64_zext_with_sregs:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
@@ -3130,33 +3138,36 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64
-; GFX7-NEXT: v_mov_b32_e32 v5, 0x50
+; GFX7-NEXT: v_mov_b32_e32 v6, 0x50
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
-; GFX7-NEXT: v_ashrrev_i32_e32 v4, 31, v4
-; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v5, v[3:4]
+; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v6, 0
+; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v4
+; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v6, v[3:4]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_mul_u64_sext_with_vregs:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dword v4, v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, 0x50
+; GFX8-NEXT: v_mov_b32_e32 v6, 0x50
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
-; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v4
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4]
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0
+; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v4
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_u64_sext_with_vregs:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v4, v[2:3], off
-; GFX9-NEXT: v_mov_b32_e32 v5, 0x50
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x50
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
-; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v4
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4]
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0
+; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v4
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4]
+; GFX9-NEXT: v_mov_b32_e32 v3, v4
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_endpgm
;
@@ -3183,17 +3194,18 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
;
; GFX12-LABEL: s_mul_u64_sext_with_vregs:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_load_b32 v2, v[2:3], off
+; GFX12-NEXT: global_load_b32 v4, v[2:3], off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0
+; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v4, 0
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT: s_endpgm
;
; GFX1250-LABEL: s_mul_u64_sext_with_vregs:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: global_load_b32 v2, v[2:3], off
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: global_load_b32 v4, v[2:3], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v2, 0
+; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v4, 0
; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX1250-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %in, align 4
@@ -3310,6 +3322,7 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr
;
; GFX1250-LABEL: s_mul_u64_sext_with_sregs:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir
index 137488f..7ca3869 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir
@@ -24,7 +24,7 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]]
; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00
- ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %5(s32)
+ ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %5(s32)
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = nnan G_AMDGPU_FMED3 [[FMUL]], %5, [[COPY2]]
; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32)
@@ -33,7 +33,7 @@ body: |
%2:vgpr(s32) = COPY %1(s32)
%3:vgpr(s32) = G_FMUL %0, %2
%4:sgpr(s32) = G_FCONSTANT float 1.000000e+00
- INLINEASM &"v_mov_b32 $0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %5:vgpr_32
+ INLINEASM &"v_mov_b32 $0, 0", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %5:vgpr_32
%6:vgpr(s32) = COPY %4(s32)
%7:vgpr(s32) = nnan G_AMDGPU_FMED3 %3(s32), %5(s32), %6(s32)
$vgpr0 = COPY %7(s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgpu-wave-address.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgpu-wave-address.mir
index f372c1f..59716a2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgpu-wave-address.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgpu-wave-address.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -o - %s | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -regbankselect-greedy -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -regbankselect-fast -o - %s | FileCheck %s
# TODO: We could use scalar
---
@@ -25,8 +25,7 @@ body: |
; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:sgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p5) = COPY [[AMDGPU_WAVE_ADDRESS]](p5)
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
- ; CHECK-NEXT: G_STORE [[COPY]](p5), [[COPY1]](p1) :: (store (p5), addrspace 1)
+ ; CHECK-NEXT: G_STORE [[COPY]](p5), [[DEF]](p1) :: (store (p5), addrspace 1)
%0:_(p1) = G_IMPLICIT_DEF
%1:_(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
G_STORE %1, %0 :: (store (p5), addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-block-addr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-block-addr.mir
index a50c7fe..fc86dd8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-block-addr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-block-addr.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -march amdgcn -mcpu=fiji -run-pass=regbankselect %s -o - | FileCheck %s
+# RUN: llc -O0 -march amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck %s
--- |
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fmul.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fmul.mir
index 5766c05..f289566 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fmul.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fmul.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
---
name: fmul_ss
@@ -17,6 +17,7 @@ body: |
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY2]], [[COPY3]]
+ ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[FMUL]]
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s32) = G_FMUL %0, %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
index d52b5fe..9034a94 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
@@ -296,9 +296,9 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_LOAD [[COPY]](p1) :: (invariant load (<8 x s32>), addrspace 1)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_LOAD [[COPY]](p1) :: (invariant load (<8 x s32>) from constant-pool, addrspace 1)
%0:_(p1) = COPY $sgpr0_sgpr1
- %1:_(<8 x s32>) = G_LOAD %0 :: (invariant load (<8 x s32>), addrspace 1)
+ %1:_(<8 x s32>) = G_LOAD %0 :: (invariant load (<8 x s32>) from constant-pool, addrspace 1)
...
---
@@ -313,9 +313,9 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s64>) = G_LOAD [[COPY]](p1) :: (invariant load (<4 x s64>), addrspace 1)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s64>) = G_LOAD [[COPY]](p1) :: (invariant load (<4 x s64>) from constant-pool, addrspace 1)
%0:_(p1) = COPY $sgpr0_sgpr1
- %1:_(<4 x s64>) = G_LOAD %0 :: (invariant load (<4 x s64>), addrspace 1)
+ %1:_(<4 x s64>) = G_LOAD %0 :: (invariant load (<4 x s64>) from constant-pool, addrspace 1)
...
---
@@ -330,9 +330,9 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_LOAD [[COPY]](p1) :: (invariant load (<16 x s32>), addrspace 1)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_LOAD [[COPY]](p1) :: (invariant load (<16 x s32>) from constant-pool, addrspace 1)
%0:_(p1) = COPY $sgpr0_sgpr1
- %1:_(<16 x s32>) = G_LOAD %0 :: (invariant load (<16 x s32>), addrspace 1)
+ %1:_(<16 x s32>) = G_LOAD %0 :: (invariant load (<16 x s32>) from constant-pool, addrspace 1)
...
---
@@ -347,9 +347,9 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s64>) = G_LOAD [[COPY]](p1) :: (invariant load (<8 x s64>), addrspace 1)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s64>) = G_LOAD [[COPY]](p1) :: (invariant load (<8 x s64>) from constant-pool, addrspace 1)
%0:_(p1) = COPY $sgpr0_sgpr1
- %1:_(<8 x s64>) = G_LOAD %0 :: (invariant load (<8 x s64>), addrspace 1)
+ %1:_(<8 x s64>) = G_LOAD %0 :: (invariant load (<8 x s64>) from constant-pool, addrspace 1)
...
---
@@ -603,9 +603,9 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_LOAD [[COPY]](p4) :: (load (<8 x s32>), addrspace 4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_LOAD [[COPY]](p4) :: (load (<8 x s32>) from constant-pool, addrspace 4)
%0:_(p4) = COPY $sgpr0_sgpr1
- %1:_(<8 x s32>) = G_LOAD %0 :: (load (<8 x s32>), addrspace 4)
+ %1:_(<8 x s32>) = G_LOAD %0 :: (load (<8 x s32>) from constant-pool, addrspace 4)
...
---
@@ -620,9 +620,9 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<16 x s16>) = G_LOAD [[COPY]](p4) :: (load (<16 x s16>), addrspace 4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<16 x s16>) = G_LOAD [[COPY]](p4) :: (load (<16 x s16>) from constant-pool, addrspace 4)
%0:_(p4) = COPY $sgpr0_sgpr1
- %1:_(<16 x s16>) = G_LOAD %0 :: (load (<16 x s16>), addrspace 4)
+ %1:_(<16 x s16>) = G_LOAD %0 :: (load (<16 x s16>) from constant-pool, addrspace 4)
...
---
@@ -637,9 +637,9 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s64>) = G_LOAD [[COPY]](p4) :: (load (<4 x s64>), addrspace 4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s64>) = G_LOAD [[COPY]](p4) :: (load (<4 x s64>) from constant-pool, addrspace 4)
%0:_(p4) = COPY $sgpr0_sgpr1
- %1:_(<4 x s64>) = G_LOAD %0 :: (load (<4 x s64>), addrspace 4)
+ %1:_(<4 x s64>) = G_LOAD %0 :: (load (<4 x s64>) from constant-pool, addrspace 4)
...
---
@@ -654,9 +654,9 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_LOAD [[COPY]](p4) :: (load (<16 x s32>), addrspace 4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_LOAD [[COPY]](p4) :: (load (<16 x s32>) from constant-pool, addrspace 4)
%0:_(p4) = COPY $sgpr0_sgpr1
- %1:_(<16 x s32>) = G_LOAD %0 :: (load (<16 x s32>), addrspace 4)
+ %1:_(<16 x s32>) = G_LOAD %0 :: (load (<16 x s32>) from constant-pool, addrspace 4)
...
---
@@ -671,9 +671,9 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s64>) = G_LOAD [[COPY]](p4) :: (load (<8 x s64>), addrspace 4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s64>) = G_LOAD [[COPY]](p4) :: (load (<8 x s64>) from constant-pool, addrspace 4)
%0:_(p4) = COPY $sgpr0_sgpr1
- %1:_(<8 x s64>) = G_LOAD %0 :: (load (<8 x s64>), addrspace 4)
+ %1:_(<8 x s64>) = G_LOAD %0 :: (load (<8 x s64>) from constant-pool, addrspace 4)
...
---
@@ -726,16 +726,16 @@ body: |
; GFX7: liveins: $sgpr0_sgpr1
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s8), addrspace 4)
+ ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s8) from constant-pool, addrspace 4)
; GFX7-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
;
; GFX12-LABEL: name: extload_constant_i8_to_i32_uniform
; GFX12: liveins: $sgpr0_sgpr1
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s8), addrspace 4)
+ ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s8) from constant-pool, addrspace 4)
%0:_(p4) = COPY $sgpr0_sgpr1
- %1:_(s32) = G_LOAD %0 :: (load (s8), addrspace 4, align 1)
+ %1:_(s32) = G_LOAD %0 :: (load (s8) from constant-pool, addrspace 4, align 1)
...
---
@@ -751,10 +751,10 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s8), addrspace 1)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s8) from constant-pool, addrspace 1)
; GCN-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
%0:_(p4) = COPY $sgpr0_sgpr1
- %1:_(s32) = G_LOAD %0 :: (load (s8), addrspace 1, align 1)
+ %1:_(s32) = G_LOAD %0 :: (load (s8) from constant-pool, addrspace 1, align 1)
...
---
@@ -770,16 +770,16 @@ body: |
; GFX7: liveins: $sgpr0_sgpr1
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s16), addrspace 4)
+ ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s16) from constant-pool, addrspace 4)
; GFX7-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
;
; GFX12-LABEL: name: extload_constant_i16_to_i32_uniform
; GFX12: liveins: $sgpr0_sgpr1
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s16), addrspace 4)
+ ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s16) from constant-pool, addrspace 4)
%0:_(p4) = COPY $sgpr0_sgpr1
- %1:_(s32) = G_LOAD %0 :: (load (s16), addrspace 4, align 2)
+ %1:_(s32) = G_LOAD %0 :: (load (s16) from constant-pool, addrspace 4, align 2)
...
---
@@ -795,10 +795,10 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s16), addrspace 1)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s16) from constant-pool, addrspace 1)
; GCN-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
%0:_(p4) = COPY $sgpr0_sgpr1
- %1:_(s32) = G_LOAD %0 :: (load (s16), addrspace 1, align 2)
+ %1:_(s32) = G_LOAD %0 :: (load (s16) from constant-pool, addrspace 1, align 2)
...
---
@@ -813,9 +813,9 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s32), addrspace 4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s32) from constant-pool, addrspace 4)
%0:_(p4) = COPY $sgpr0_sgpr1
- %1:_(s32) = G_LOAD %0 :: (load (s32), addrspace 4, align 4)
+ %1:_(s32) = G_LOAD %0 :: (load (s32) from constant-pool, addrspace 4, align 4)
...
---
@@ -831,10 +831,10 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s32), align 2, addrspace 4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s32) from constant-pool, align 2, addrspace 4)
; GCN-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
%0:_(p4) = COPY $sgpr0_sgpr1
- %1:_(s32) = G_LOAD %0 :: (load (s32), addrspace 4, align 2)
+ %1:_(s32) = G_LOAD %0 :: (load (s32) from constant-pool, addrspace 4, align 2)
...
---
@@ -850,10 +850,10 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s32), align 1, addrspace 4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s32) from constant-pool, align 1, addrspace 4)
; GCN-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
%0:_(p4) = COPY $sgpr0_sgpr1
- %1:_(s32) = G_LOAD %0 :: (load (s32), addrspace 4, align 1)
+ %1:_(s32) = G_LOAD %0 :: (load (s32) from constant-pool, addrspace 4, align 1)
...
---
@@ -888,13 +888,13 @@ body: |
; GCN: liveins: $vgpr0_vgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr(p4) = COPY $vgpr0_vgpr1
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load (<4 x s32>), align 32, addrspace 4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load (<4 x s32>) from constant-pool, align 32, addrspace 4)
; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64)
- ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from unknown-address + 16, addrspace 4)
+ ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from constant-pool + 16, basealign 32, addrspace 4)
; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
%0:_(p4) = COPY $vgpr0_vgpr1
- %1:_(<8 x s32>) = G_LOAD %0 :: (load (<8 x s32>), addrspace 4)
+ %1:_(<8 x s32>) = G_LOAD %0 :: (load (<8 x s32>) from constant-pool, addrspace 4)
...
---
@@ -916,10 +916,10 @@ body: |
; GCN-NEXT: successors: %bb.1(0x80000000)
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[PHI:%[0-9]+]]:vgpr(p4) = G_PHI [[COPY]](p4), %bb.0, %3(p4), %bb.1
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PHI]](p4) :: (load (<4 x s32>), align 32, addrspace 4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PHI]](p4) :: (load (<4 x s32>) from constant-pool, align 32, addrspace 4)
; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = nuw inbounds G_PTR_ADD [[PHI]], [[C]](s64)
- ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from unknown-address + 16, addrspace 4)
+ ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from constant-pool + 16, basealign 32, addrspace 4)
; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr(p4) = COPY [[COPY1]](p4)
; GCN-NEXT: G_BR %bb.1
@@ -933,7 +933,7 @@ body: |
bb.1:
%2:_(p4) = G_PHI %0, %bb.0, %4, %bb.1
- %3:_(<8 x s32>) = G_LOAD %2 :: (load (<8 x s32>), addrspace 4)
+ %3:_(<8 x s32>) = G_LOAD %2 :: (load (<8 x s32>) from constant-pool, addrspace 4)
%4:_(p4) = COPY %1
G_BR %bb.1
...
@@ -950,10 +950,10 @@ body: |
; GFX7: liveins: $sgpr0_sgpr1
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<2 x s32>), align 4, addrspace 4)
+ ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<2 x s32>) from constant-pool, align 4, addrspace 4)
; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8
; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64)
- ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (s32) from unknown-address + 8, addrspace 4)
+ ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (s32) from constant-pool + 8, addrspace 4)
; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32)
; GFX7-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s32>)
@@ -962,10 +962,10 @@ body: |
; GFX12: liveins: $sgpr0_sgpr1
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<3 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<3 x s32>), align 4, addrspace 4)
+ ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<3 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<3 x s32>) from constant-pool, align 4, addrspace 4)
; GFX12-NEXT: S_ENDPGM 0, implicit [[LOAD]](<3 x s32>)
%0:_(p4) = COPY $sgpr0_sgpr1
- %1:_(<3 x s32>) = G_LOAD %0 :: (invariant load (<3 x s32>), addrspace 4, align 4)
+ %1:_(<3 x s32>) = G_LOAD %0 :: (invariant load (<3 x s32>) from constant-pool, addrspace 4, align 4)
S_ENDPGM 0, implicit %1
...
@@ -981,10 +981,10 @@ body: |
; GFX7: liveins: $sgpr0_sgpr1
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<2 x s32>), addrspace 4)
+ ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<2 x s32>) from constant-pool, addrspace 4)
; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8
; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64)
- ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (s32) from unknown-address + 8, align 8, addrspace 4)
+ ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (s32) from constant-pool + 8, align 8, addrspace 4)
; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32)
; GFX7-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s32>)
@@ -993,10 +993,10 @@ body: |
; GFX12: liveins: $sgpr0_sgpr1
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<3 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<3 x s32>), align 8, addrspace 4)
+ ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<3 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<3 x s32>) from constant-pool, align 8, addrspace 4)
; GFX12-NEXT: S_ENDPGM 0, implicit [[LOAD]](<3 x s32>)
%0:_(p4) = COPY $sgpr0_sgpr1
- %1:_(<3 x s32>) = G_LOAD %0 :: (invariant load (<3 x s32>), addrspace 4, align 8)
+ %1:_(<3 x s32>) = G_LOAD %0 :: (invariant load (<3 x s32>) from constant-pool, addrspace 4, align 8)
S_ENDPGM 0, implicit %1
...
@@ -1012,7 +1012,7 @@ body: |
; GFX7: liveins: $sgpr0_sgpr1
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<4 x s32>), addrspace 4)
+ ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<4 x s32>) from constant-pool, addrspace 4)
; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<4 x s32>)
; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32)
; GFX7-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s32>)
@@ -1021,10 +1021,10 @@ body: |
; GFX12: liveins: $sgpr0_sgpr1
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<3 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<3 x s32>), align 16, addrspace 4)
+ ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<3 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<3 x s32>) from constant-pool, align 16, addrspace 4)
; GFX12-NEXT: S_ENDPGM 0, implicit [[LOAD]](<3 x s32>)
%0:_(p4) = COPY $sgpr0_sgpr1
- %1:_(<3 x s32>) = G_LOAD %0 :: (invariant load (<3 x s32>), addrspace 4, align 16)
+ %1:_(<3 x s32>) = G_LOAD %0 :: (invariant load (<3 x s32>) from constant-pool, addrspace 4, align 16)
S_ENDPGM 0, implicit %1
...
@@ -1040,10 +1040,10 @@ body: |
; GFX7: liveins: $sgpr0_sgpr1
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<4 x s16>), align 4, addrspace 4)
+ ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<4 x s16>) from constant-pool, align 4, addrspace 4)
; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8
; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64)
- ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(<2 x s16>) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (<2 x s16>) from unknown-address + 8, addrspace 4)
+ ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(<2 x s16>) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (<2 x s16>) from constant-pool + 8, addrspace 4)
; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(<2 x s16>), [[UV1:%[0-9]+]]:sgpr(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>)
; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:sgpr(<6 x s16>) = G_CONCAT_VECTORS [[UV]](<2 x s16>), [[UV1]](<2 x s16>), [[LOAD1]](<2 x s16>)
; GFX7-NEXT: S_ENDPGM 0, implicit [[CONCAT_VECTORS]](<6 x s16>)
@@ -1052,10 +1052,10 @@ body: |
; GFX12: liveins: $sgpr0_sgpr1
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<6 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<6 x s16>), align 4, addrspace 4)
+ ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<6 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<6 x s16>) from constant-pool, align 4, addrspace 4)
; GFX12-NEXT: S_ENDPGM 0, implicit [[LOAD]](<6 x s16>)
%0:_(p4) = COPY $sgpr0_sgpr1
- %1:_(<6 x s16>) = G_LOAD %0 :: (invariant load (<6 x s16>), addrspace 4, align 4)
+ %1:_(<6 x s16>) = G_LOAD %0 :: (invariant load (<6 x s16>) from constant-pool, addrspace 4, align 4)
S_ENDPGM 0, implicit %1
...
@@ -1071,10 +1071,10 @@ body: |
; GFX7: liveins: $sgpr0_sgpr1
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<4 x s16>), addrspace 4)
+ ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<4 x s16>) from constant-pool, addrspace 4)
; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8
; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64)
- ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(<2 x s16>) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (<2 x s16>) from unknown-address + 8, align 8, addrspace 4)
+ ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(<2 x s16>) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (<2 x s16>) from constant-pool + 8, align 8, addrspace 4)
; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(<2 x s16>), [[UV1:%[0-9]+]]:sgpr(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>)
; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:sgpr(<6 x s16>) = G_CONCAT_VECTORS [[UV]](<2 x s16>), [[UV1]](<2 x s16>), [[LOAD1]](<2 x s16>)
; GFX7-NEXT: S_ENDPGM 0, implicit [[CONCAT_VECTORS]](<6 x s16>)
@@ -1083,10 +1083,10 @@ body: |
; GFX12: liveins: $sgpr0_sgpr1
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<6 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<6 x s16>), align 8, addrspace 4)
+ ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<6 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<6 x s16>) from constant-pool, align 8, addrspace 4)
; GFX12-NEXT: S_ENDPGM 0, implicit [[LOAD]](<6 x s16>)
%0:_(p4) = COPY $sgpr0_sgpr1
- %1:_(<6 x s16>) = G_LOAD %0 :: (invariant load (<6 x s16>), addrspace 4, align 8)
+ %1:_(<6 x s16>) = G_LOAD %0 :: (invariant load (<6 x s16>) from constant-pool, addrspace 4, align 8)
S_ENDPGM 0, implicit %1
...
@@ -1102,7 +1102,7 @@ body: |
; GFX7: liveins: $sgpr0_sgpr1
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<8 x s16>), addrspace 4)
+ ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<8 x s16>) from constant-pool, addrspace 4)
; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(<2 x s16>), [[UV1:%[0-9]+]]:sgpr(<2 x s16>), [[UV2:%[0-9]+]]:sgpr(<2 x s16>), [[UV3:%[0-9]+]]:sgpr(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<8 x s16>)
; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:sgpr(<6 x s16>) = G_CONCAT_VECTORS [[UV]](<2 x s16>), [[UV1]](<2 x s16>), [[UV2]](<2 x s16>)
; GFX7-NEXT: S_ENDPGM 0, implicit [[CONCAT_VECTORS]](<6 x s16>)
@@ -1111,10 +1111,10 @@ body: |
; GFX12: liveins: $sgpr0_sgpr1
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<6 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<6 x s16>), align 16, addrspace 4)
+ ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<6 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<6 x s16>) from constant-pool, align 16, addrspace 4)
; GFX12-NEXT: S_ENDPGM 0, implicit [[LOAD]](<6 x s16>)
%0:_(p4) = COPY $sgpr0_sgpr1
- %1:_(<6 x s16>) = G_LOAD %0 :: (invariant load (<6 x s16>), addrspace 4, align 16)
+ %1:_(<6 x s16>) = G_LOAD %0 :: (invariant load (<6 x s16>) from constant-pool, addrspace 4, align 16)
S_ENDPGM 0, implicit %1
...
@@ -1130,10 +1130,10 @@ body: |
; GFX7: liveins: $sgpr0_sgpr1
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(s64) = G_LOAD [[COPY]](p4) :: (invariant load (s64), align 4, addrspace 4)
+ ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(s64) = G_LOAD [[COPY]](p4) :: (invariant load (s64) from constant-pool, align 4, addrspace 4)
; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8
; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64)
- ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (s32) from unknown-address + 8, addrspace 4)
+ ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (s32) from constant-pool + 8, addrspace 4)
; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](s64)
; GFX7-NEXT: [[MV:%[0-9]+]]:sgpr(s96) = G_MERGE_VALUES [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32)
; GFX7-NEXT: S_ENDPGM 0, implicit [[MV]](s96)
@@ -1142,10 +1142,10 @@ body: |
; GFX12: liveins: $sgpr0_sgpr1
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(s96) = G_LOAD [[COPY]](p4) :: (invariant load (s96), align 4, addrspace 4)
+ ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(s96) = G_LOAD [[COPY]](p4) :: (invariant load (s96) from constant-pool, align 4, addrspace 4)
; GFX12-NEXT: S_ENDPGM 0, implicit [[LOAD]](s96)
%0:_(p4) = COPY $sgpr0_sgpr1
- %1:_(s96) = G_LOAD %0 :: (invariant load (s96), addrspace 4, align 4)
+ %1:_(s96) = G_LOAD %0 :: (invariant load (s96) from constant-pool, addrspace 4, align 4)
S_ENDPGM 0, implicit %1
...
@@ -1161,10 +1161,10 @@ body: |
; GFX7: liveins: $sgpr0_sgpr1
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(s64) = G_LOAD [[COPY]](p4) :: (invariant load (s64), addrspace 4)
+ ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(s64) = G_LOAD [[COPY]](p4) :: (invariant load (s64) from constant-pool, addrspace 4)
; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8
; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64)
- ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (s32) from unknown-address + 8, align 8, addrspace 4)
+ ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (s32) from constant-pool + 8, align 8, addrspace 4)
; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](s64)
; GFX7-NEXT: [[MV:%[0-9]+]]:sgpr(s96) = G_MERGE_VALUES [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32)
; GFX7-NEXT: S_ENDPGM 0, implicit [[MV]](s96)
@@ -1173,10 +1173,10 @@ body: |
; GFX12: liveins: $sgpr0_sgpr1
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(s96) = G_LOAD [[COPY]](p4) :: (invariant load (s96), align 8, addrspace 4)
+ ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(s96) = G_LOAD [[COPY]](p4) :: (invariant load (s96) from constant-pool, align 8, addrspace 4)
; GFX12-NEXT: S_ENDPGM 0, implicit [[LOAD]](s96)
%0:_(p4) = COPY $sgpr0_sgpr1
- %1:_(s96) = G_LOAD %0 :: (invariant load (s96), addrspace 4, align 8)
+ %1:_(s96) = G_LOAD %0 :: (invariant load (s96) from constant-pool, addrspace 4, align 8)
S_ENDPGM 0, implicit %1
...
@@ -1192,7 +1192,7 @@ body: |
; GFX7: liveins: $sgpr0_sgpr1
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(s128) = G_LOAD [[COPY]](p4) :: (invariant load (s128), addrspace 4)
+ ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(s128) = G_LOAD [[COPY]](p4) :: (invariant load (s128) from constant-pool, addrspace 4)
; GFX7-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s96) = G_TRUNC [[LOAD]](s128)
; GFX7-NEXT: S_ENDPGM 0, implicit [[TRUNC]](s96)
;
@@ -1200,9 +1200,9 @@ body: |
; GFX12: liveins: $sgpr0_sgpr1
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(s96) = G_LOAD [[COPY]](p4) :: (invariant load (s96), align 16, addrspace 4)
+ ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(s96) = G_LOAD [[COPY]](p4) :: (invariant load (s96) from constant-pool, align 16, addrspace 4)
; GFX12-NEXT: S_ENDPGM 0, implicit [[LOAD]](s96)
%0:_(p4) = COPY $sgpr0_sgpr1
- %1:_(s96) = G_LOAD %0 :: (invariant load (s96), addrspace 4, align 16)
+ %1:_(s96) = G_LOAD %0 :: (invariant load (s96) from constant-pool, addrspace 4, align 16)
S_ENDPGM 0, implicit %1
...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll
index 5240bf4..e450da7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll
@@ -491,7 +491,7 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %
; OLD_RBS-NEXT: s_branch .LBB16_3
; OLD_RBS-NEXT: .LBB16_1: ; %Flow3
; OLD_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1
-; OLD_RBS-NEXT: s_waitcnt_depctr 0xffe3
+; OLD_RBS-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; OLD_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s3
; OLD_RBS-NEXT: s_andn2_b32 s1, s1, exec_lo
; OLD_RBS-NEXT: s_and_b32 s3, exec_lo, s4
@@ -547,13 +547,13 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %
;
; NEW_RBS-LABEL: loop_with_2breaks:
; NEW_RBS: ; %bb.0: ; %entry
-; NEW_RBS-NEXT: s_mov_b32 s4, 0
; NEW_RBS-NEXT: s_mov_b32 s0, 0
+; NEW_RBS-NEXT: s_mov_b32 s4, 0
; NEW_RBS-NEXT: ; implicit-def: $sgpr5
; NEW_RBS-NEXT: s_branch .LBB16_3
; NEW_RBS-NEXT: .LBB16_1: ; %Flow3
; NEW_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1
-; NEW_RBS-NEXT: s_waitcnt_depctr 0xffe3
+; NEW_RBS-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; NEW_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s7
; NEW_RBS-NEXT: s_andn2_b32 s2, s5, exec_lo
; NEW_RBS-NEXT: s_and_b32 s3, exec_lo, s6
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir
index b2ff0995c..cdc673e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir
@@ -35,10 +35,10 @@ body: |
; GFX7: liveins: $sgpr0_sgpr1
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_LOAD [[COPY]](p4) :: (load (<2 x s32>), addrspace 4)
+ ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_LOAD [[COPY]](p4) :: (load (<2 x s32>) from constant-pool, addrspace 4)
; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8
; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64)
- ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (load (s32) from unknown-address + 8, align 8, addrspace 4)
+ ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (load (s32) from constant-pool + 8, align 8, addrspace 4)
; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32)
; GFX7-NEXT: $sgpr0_sgpr1_sgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
@@ -47,10 +47,10 @@ body: |
; GFX12: liveins: $sgpr0_sgpr1
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
    - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<3 x s32>) = G_LOAD [[COPY]](p4) :: (load (<3 x s32>), align 8, !range {{.+}}, addrspace 4)
+ ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<3 x s32>) = G_LOAD [[COPY]](p4) :: (load (<3 x s32>) from constant-pool, align 8, !range {{.+}}, addrspace 4)
; GFX12-NEXT: $sgpr0_sgpr1_sgpr2 = COPY [[LOAD]](<3 x s32>)
%0:_(p4) = COPY $sgpr0_sgpr1
- %1:_(<3 x s32>) = G_LOAD %0 :: (load (<3 x s32>), align 8, addrspace 4, !range !3)
+ %1:_(<3 x s32>) = G_LOAD %0 :: (load (<3 x s32>) from constant-pool, align 8, addrspace 4, !range !3)
$sgpr0_sgpr1_sgpr2 = COPY %1
...
@@ -66,10 +66,10 @@ body: |
; GFX7: liveins: $sgpr0_sgpr1
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_LOAD [[COPY]](p4) :: (load (<2 x s32>), !tbaa !2, addrspace 4)
+ ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_LOAD [[COPY]](p4) :: (load (<2 x s32>) from constant-pool, !tbaa !2, addrspace 4)
; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8
; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64)
- ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (load (s32) from unknown-address + 8, align 8, !tbaa !2, addrspace 4)
+ ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (load (s32) from constant-pool + 8, align 8, !tbaa !2, addrspace 4)
; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32)
; GFX7-NEXT: $sgpr0_sgpr1_sgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
@@ -78,10 +78,10 @@ body: |
; GFX12: liveins: $sgpr0_sgpr1
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<3 x s32>) = G_LOAD [[COPY]](p4) :: (load (<3 x s32>), align 8, !tbaa !2, addrspace 4)
+ ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<3 x s32>) = G_LOAD [[COPY]](p4) :: (load (<3 x s32>) from constant-pool, align 8, !tbaa !2, addrspace 4)
; GFX12-NEXT: $sgpr0_sgpr1_sgpr2 = COPY [[LOAD]](<3 x s32>)
%0:_(p4) = COPY $sgpr0_sgpr1
- %1:_(<3 x s32>) = G_LOAD %0 :: (load (<3 x s32>), align 8, addrspace 4, !tbaa !1)
+ %1:_(<3 x s32>) = G_LOAD %0 :: (load (<3 x s32>) from constant-pool, align 8, addrspace 4, !tbaa !1)
$sgpr0_sgpr1_sgpr2 = COPY %1
...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-widen-scalar-loads.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-widen-scalar-loads.mir
index 7838e97..70a2ddf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-widen-scalar-loads.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-widen-scalar-loads.mir
@@ -14,24 +14,24 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), align 8, addrspace 4)
+ ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, align 8, addrspace 4)
; GFX8-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
;
; GFX9-LABEL: name: constant_load_i8_align8
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), align 8, addrspace 4)
+ ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, align 8, addrspace 4)
; GFX9-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
;
; GFX10-LABEL: name: constant_load_i8_align8
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), align 8, addrspace 4)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, align 8, addrspace 4)
; GFX10-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
%0:_(p1) = COPY $sgpr0_sgpr1
- %1:_(s32) = G_LOAD %0 :: (invariant load (s8), align 8, addrspace 4)
+ %1:_(s32) = G_LOAD %0 :: (invariant load (s8) from constant-pool, align 8, addrspace 4)
S_ENDPGM 0, implicit %1
...
---
@@ -45,24 +45,24 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4)
+ ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 4)
; GFX8-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
;
; GFX9-LABEL: name: constant_load_i8_align4
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4)
+ ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 4)
; GFX9-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
;
; GFX10-LABEL: name: constant_load_i8_align4
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 4)
; GFX10-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
%0:_(p1) = COPY $sgpr0_sgpr1
- %1:_(s32) = G_LOAD %0 :: (invariant load (s8), align 4, addrspace 4)
+ %1:_(s32) = G_LOAD %0 :: (invariant load (s8) from constant-pool, align 4, addrspace 4)
S_ENDPGM 0, implicit %1
...
---
@@ -76,24 +76,24 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4)
+ ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 4)
; GFX8-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
;
; GFX9-LABEL: name: constant_load_i16_align4
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4)
+ ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 4)
; GFX9-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
;
; GFX10-LABEL: name: constant_load_i16_align4
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 4)
; GFX10-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
%0:_(p1) = COPY $sgpr0_sgpr1
- %1:_(s32) = G_LOAD %0 :: (invariant load (s16), align 4, addrspace 4)
+ %1:_(s32) = G_LOAD %0 :: (invariant load (s16) from constant-pool, align 4, addrspace 4)
S_ENDPGM 0, implicit %1
...
---
@@ -107,7 +107,7 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4)
+ ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 4)
; GFX8-NEXT: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 8
; GFX8-NEXT: S_ENDPGM 0, implicit [[SEXT_INREG]](s32)
;
@@ -115,7 +115,7 @@ body: |
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4)
+ ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 4)
; GFX9-NEXT: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 8
; GFX9-NEXT: S_ENDPGM 0, implicit [[SEXT_INREG]](s32)
;
@@ -123,11 +123,11 @@ body: |
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 4)
; GFX10-NEXT: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 8
; GFX10-NEXT: S_ENDPGM 0, implicit [[SEXT_INREG]](s32)
%0:_(p1) = COPY $sgpr0_sgpr1
- %1:_(s32) = G_SEXTLOAD %0 :: (invariant load (s8), align 4, addrspace 4)
+ %1:_(s32) = G_SEXTLOAD %0 :: (invariant load (s8) from constant-pool, align 4, addrspace 4)
S_ENDPGM 0, implicit %1
...
---
@@ -141,7 +141,7 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4)
+ ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 4)
; GFX8-NEXT: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 16
; GFX8-NEXT: S_ENDPGM 0, implicit [[SEXT_INREG]](s32)
;
@@ -149,7 +149,7 @@ body: |
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4)
+ ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 4)
; GFX9-NEXT: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 16
; GFX9-NEXT: S_ENDPGM 0, implicit [[SEXT_INREG]](s32)
;
@@ -157,11 +157,11 @@ body: |
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 4)
; GFX10-NEXT: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 16
; GFX10-NEXT: S_ENDPGM 0, implicit [[SEXT_INREG]](s32)
%0:_(p1) = COPY $sgpr0_sgpr1
- %1:_(s32) = G_SEXTLOAD %0 :: (invariant load (s16), align 4, addrspace 4)
+ %1:_(s32) = G_SEXTLOAD %0 :: (invariant load (s16) from constant-pool, align 4, addrspace 4)
S_ENDPGM 0, implicit %1
...
@@ -176,7 +176,7 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4)
+ ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 4)
; GFX8-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 255
; GFX8-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
; GFX8-NEXT: S_ENDPGM 0, implicit [[AND]](s32)
@@ -185,7 +185,7 @@ body: |
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4)
+ ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 4)
; GFX9-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 255
; GFX9-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
; GFX9-NEXT: S_ENDPGM 0, implicit [[AND]](s32)
@@ -194,12 +194,12 @@ body: |
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 4)
; GFX10-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 255
; GFX10-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
; GFX10-NEXT: S_ENDPGM 0, implicit [[AND]](s32)
%0:_(p1) = COPY $sgpr0_sgpr1
- %1:_(s32) = G_ZEXTLOAD %0 :: (invariant load (s8), align 4, addrspace 4)
+ %1:_(s32) = G_ZEXTLOAD %0 :: (invariant load (s8) from constant-pool, align 4, addrspace 4)
S_ENDPGM 0, implicit %1
...
---
@@ -213,7 +213,7 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4)
+ ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 4)
; GFX8-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
; GFX8-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
; GFX8-NEXT: S_ENDPGM 0, implicit [[AND]](s32)
@@ -222,7 +222,7 @@ body: |
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4)
+ ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 4)
; GFX9-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
; GFX9-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
; GFX9-NEXT: S_ENDPGM 0, implicit [[AND]](s32)
@@ -231,12 +231,12 @@ body: |
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 4)
; GFX10-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
; GFX10-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
; GFX10-NEXT: S_ENDPGM 0, implicit [[AND]](s32)
%0:_(p1) = COPY $sgpr0_sgpr1
- %1:_(s32) = G_ZEXTLOAD %0 :: (invariant load (s16), align 4, addrspace 4)
+ %1:_(s32) = G_ZEXTLOAD %0 :: (invariant load (s16) from constant-pool, align 4, addrspace 4)
S_ENDPGM 0, implicit %1
...
---
@@ -250,24 +250,24 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 1)
    + ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 1)
; GFX8-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
;
; GFX9-LABEL: name: global_load_i8_align4
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 1)
    + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 1)
; GFX9-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
;
; GFX10-LABEL: name: global_load_i8_align4
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 1)
    + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 1)
; GFX10-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
%0:_(p1) = COPY $sgpr0_sgpr1
- %1:_(s32) = G_LOAD %0 :: (invariant load (s8), align 4, addrspace 1)
+ %1:_(s32) = G_LOAD %0 :: (invariant load (s8) from constant-pool, align 4, addrspace 1)
S_ENDPGM 0, implicit %1
...
---
@@ -281,24 +281,24 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 1)
    + ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 1)
; GFX8-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
;
; GFX9-LABEL: name: global_load_i16_align4
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 1)
    + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 1)
; GFX9-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
;
; GFX10-LABEL: name: global_load_i16_align4
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 1)
    + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 1)
; GFX10-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
%0:_(p1) = COPY $sgpr0_sgpr1
- %1:_(s32) = G_LOAD %0 :: (invariant load (s16), align 4, addrspace 1)
+ %1:_(s32) = G_LOAD %0 :: (invariant load (s16) from constant-pool, align 4, addrspace 1)
S_ENDPGM 0, implicit %1
...
---
@@ -312,7 +312,7 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 1)
    + ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 1)
; GFX8-NEXT: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 8
; GFX8-NEXT: S_ENDPGM 0, implicit [[SEXT_INREG]](s32)
;
@@ -320,7 +320,7 @@ body: |
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 1)
    + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 1)
; GFX9-NEXT: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 8
; GFX9-NEXT: S_ENDPGM 0, implicit [[SEXT_INREG]](s32)
;
@@ -328,11 +328,11 @@ body: |
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 1)
    + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 1)
; GFX10-NEXT: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 8
; GFX10-NEXT: S_ENDPGM 0, implicit [[SEXT_INREG]](s32)
%0:_(p1) = COPY $sgpr0_sgpr1
- %1:_(s32) = G_SEXTLOAD %0 :: (invariant load (s8), align 4, addrspace 1)
+ %1:_(s32) = G_SEXTLOAD %0 :: (invariant load (s8) from constant-pool, align 4, addrspace 1)
S_ENDPGM 0, implicit %1
...
---
@@ -346,7 +346,7 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 1)
    + ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 1)
; GFX8-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
; GFX8-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
; GFX8-NEXT: S_ENDPGM 0, implicit [[AND]](s32)
@@ -355,7 +355,7 @@ body: |
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 1)
    + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 1)
; GFX9-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
; GFX9-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
; GFX9-NEXT: S_ENDPGM 0, implicit [[AND]](s32)
@@ -364,12 +364,12 @@ body: |
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 1)
    + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32) from constant-pool, addrspace 1)
; GFX10-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
; GFX10-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
; GFX10-NEXT: S_ENDPGM 0, implicit [[AND]](s32)
%0:_(p1) = COPY $sgpr0_sgpr1
- %1:_(s32) = G_ZEXTLOAD %0 :: (invariant load (s16), align 4, addrspace 1)
+ %1:_(s32) = G_ZEXTLOAD %0 :: (invariant load (s16) from constant-pool, align 4, addrspace 1)
S_ENDPGM 0, implicit %1
...
# Some negative test cases
@@ -383,7 +383,7 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s8), align 2, addrspace 4)
+ ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s8) from constant-pool, align 2, addrspace 4)
; GFX8-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
; GFX8-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
;
@@ -391,7 +391,7 @@ body: |
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s8), align 2, addrspace 4)
+ ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s8) from constant-pool, align 2, addrspace 4)
; GFX9-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
; GFX9-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
;
@@ -399,11 +399,11 @@ body: |
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s8), align 2, addrspace 4)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s8) from constant-pool, align 2, addrspace 4)
; GFX10-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
; GFX10-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
%0:_(p1) = COPY $sgpr0_sgpr1
- %1:_(s32) = G_LOAD %0 :: (invariant load (s8), align 2, addrspace 4)
+ %1:_(s32) = G_LOAD %0 :: (invariant load (s8) from constant-pool, align 2, addrspace 4)
S_ENDPGM 0, implicit %1
...
---
@@ -417,7 +417,7 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s16), addrspace 4)
+ ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s16) from constant-pool, addrspace 4)
; GFX8-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
; GFX8-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
;
@@ -425,7 +425,7 @@ body: |
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s16), addrspace 4)
+ ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s16) from constant-pool, addrspace 4)
; GFX9-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
; GFX9-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
;
@@ -433,11 +433,11 @@ body: |
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s16), addrspace 4)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s16) from constant-pool, addrspace 4)
; GFX10-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
; GFX10-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
%0:_(p1) = COPY $sgpr0_sgpr1
- %1:_(s32) = G_LOAD %0 :: (invariant load (s16), align 2, addrspace 4)
+ %1:_(s32) = G_LOAD %0 :: (invariant load (s16) from constant-pool, align 2, addrspace 4)
S_ENDPGM 0, implicit %1
...
---
@@ -451,7 +451,7 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p1) :: (invariant load (s8), align 2, addrspace 4)
+ ; GFX8-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p1) :: (invariant load (s8) from constant-pool, align 2, addrspace 4)
; GFX8-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]]
; GFX8-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
;
@@ -459,7 +459,7 @@ body: |
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p1) :: (invariant load (s8), align 2, addrspace 4)
+ ; GFX9-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p1) :: (invariant load (s8) from constant-pool, align 2, addrspace 4)
; GFX9-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]]
; GFX9-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
;
@@ -467,11 +467,11 @@ body: |
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p1) :: (invariant load (s8), align 2, addrspace 4)
+ ; GFX10-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p1) :: (invariant load (s8) from constant-pool, align 2, addrspace 4)
; GFX10-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]]
; GFX10-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
%0:_(p1) = COPY $sgpr0_sgpr1
- %1:_(s32) = G_SEXTLOAD %0 :: (invariant load (s8), align 2, addrspace 4)
+ %1:_(s32) = G_SEXTLOAD %0 :: (invariant load (s8) from constant-pool, align 2, addrspace 4)
S_ENDPGM 0, implicit %1
...
---
@@ -485,7 +485,7 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p1) :: (invariant load (s16), addrspace 4)
+ ; GFX8-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p1) :: (invariant load (s16) from constant-pool, addrspace 4)
; GFX8-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]]
; GFX8-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
;
@@ -493,7 +493,7 @@ body: |
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p1) :: (invariant load (s16), addrspace 4)
+ ; GFX9-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p1) :: (invariant load (s16) from constant-pool, addrspace 4)
; GFX9-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]]
; GFX9-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
;
@@ -501,11 +501,11 @@ body: |
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p1) :: (invariant load (s16), addrspace 4)
+ ; GFX10-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p1) :: (invariant load (s16) from constant-pool, addrspace 4)
; GFX10-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]]
; GFX10-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
%0:_(p1) = COPY $sgpr0_sgpr1
- %1:_(s32) = G_SEXTLOAD %0 :: (invariant load (s16), align 2, addrspace 4)
+ %1:_(s32) = G_SEXTLOAD %0 :: (invariant load (s16) from constant-pool, align 2, addrspace 4)
S_ENDPGM 0, implicit %1
...
---
@@ -519,7 +519,7 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p1) :: (invariant load (s8), align 2, addrspace 4)
+ ; GFX8-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p1) :: (invariant load (s8) from constant-pool, align 2, addrspace 4)
; GFX8-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]]
; GFX8-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
;
@@ -527,7 +527,7 @@ body: |
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p1) :: (invariant load (s8), align 2, addrspace 4)
+ ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p1) :: (invariant load (s8) from constant-pool, align 2, addrspace 4)
; GFX9-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]]
; GFX9-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
;
@@ -535,11 +535,11 @@ body: |
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p1) :: (invariant load (s8), align 2, addrspace 4)
+ ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p1) :: (invariant load (s8) from constant-pool, align 2, addrspace 4)
; GFX10-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]]
; GFX10-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
%0:_(p1) = COPY $sgpr0_sgpr1
- %1:_(s32) = G_ZEXTLOAD %0 :: (invariant load (s8), align 2, addrspace 4)
+ %1:_(s32) = G_ZEXTLOAD %0 :: (invariant load (s8) from constant-pool, align 2, addrspace 4)
S_ENDPGM 0, implicit %1
...
---
@@ -553,7 +553,7 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p1) :: (invariant load (s16), addrspace 4)
+ ; GFX8-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p1) :: (invariant load (s16) from constant-pool, addrspace 4)
; GFX8-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]]
; GFX8-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
;
@@ -561,7 +561,7 @@ body: |
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p1) :: (invariant load (s16), addrspace 4)
+ ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p1) :: (invariant load (s16) from constant-pool, addrspace 4)
; GFX9-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]]
; GFX9-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
;
@@ -569,11 +569,11 @@ body: |
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p1) :: (invariant load (s16), addrspace 4)
+ ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p1) :: (invariant load (s16) from constant-pool, addrspace 4)
; GFX10-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]]
; GFX10-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
%0:_(p1) = COPY $sgpr0_sgpr1
- %1:_(s32) = G_ZEXTLOAD %0 :: (invariant load (s16), align 2, addrspace 4)
+ %1:_(s32) = G_ZEXTLOAD %0 :: (invariant load (s16) from constant-pool, align 2, addrspace 4)
S_ENDPGM 0, implicit %1
...
---
@@ -587,7 +587,7 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (load (s8), align 4, addrspace 3)
+ ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (load (s8) from constant-pool, align 4, addrspace 3)
; GFX8-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
; GFX8-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
;
@@ -595,7 +595,7 @@ body: |
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (load (s8), align 4, addrspace 3)
+ ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (load (s8) from constant-pool, align 4, addrspace 3)
; GFX9-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
; GFX9-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
;
@@ -603,11 +603,11 @@ body: |
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (load (s8), align 4, addrspace 3)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (load (s8) from constant-pool, align 4, addrspace 3)
; GFX10-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
; GFX10-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
%0:_(p1) = COPY $sgpr0_sgpr1
- %1:_(s32) = G_LOAD %0 :: (load (s8), align 4, addrspace 3)
+ %1:_(s32) = G_LOAD %0 :: (load (s8) from constant-pool, align 4, addrspace 3)
S_ENDPGM 0, implicit %1
...
---
@@ -622,7 +622,7 @@ body: |
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load (s8), align 4, addrspace 5)
+ ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load (s8) from constant-pool, align 4, addrspace 5)
; GFX8-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
;
; GFX9-LABEL: name: private_load_i8_align4
@@ -630,7 +630,7 @@ body: |
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load (s8), align 4, addrspace 5)
+ ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load (s8) from constant-pool, align 4, addrspace 5)
; GFX9-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
;
; GFX10-LABEL: name: private_load_i8_align4
@@ -638,9 +638,9 @@ body: |
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load (s8), align 4, addrspace 5)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load (s8) from constant-pool, align 4, addrspace 5)
; GFX10-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
%0:_(p1) = COPY $sgpr0_sgpr1
- %1:_(s32) = G_LOAD %0 :: (load (s8), align 4, addrspace 5)
+ %1:_(s32) = G_LOAD %0 :: (load (s8) from constant-pool, align 4, addrspace 5)
S_ENDPGM 0, implicit %1
...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 4f2c454..b7c84f1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -31,128 +31,126 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) {
; CHECK-NEXT: v_xor_b32_e32 v1, v3, v0
; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v2
; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1
-; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v2
-; CHECK-NEXT: v_subb_u32_e32 v11, vcc, 0, v1, vcc
+; CHECK-NEXT: v_sub_i32_e32 v13, vcc, 0, v2
+; CHECK-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc
; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v6
; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3
; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v3
-; CHECK-NEXT: v_trunc_f32_e32 v8, v6
-; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v8
-; CHECK-NEXT: v_cvt_u32_f32_e32 v9, v3
-; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v8
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0
-; CHECK-NEXT: v_mov_b32_e32 v3, v7
-; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4]
-; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6
-; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; CHECK-NEXT: v_mul_hi_u32 v8, v9, v6
+; CHECK-NEXT: v_trunc_f32_e32 v6, v6
+; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v6
+; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3
+; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v6
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v3, 0
+; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v12, v[7:8]
+; CHECK-NEXT: v_mul_lo_u32 v7, v12, v6
+; CHECK-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v3, v[8:9]
+; CHECK-NEXT: v_mul_hi_u32 v8, v3, v6
; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6
-; CHECK-NEXT: v_mul_lo_u32 v13, v9, v7
-; CHECK-NEXT: v_mul_lo_u32 v14, v12, v7
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v13
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8
-; CHECK-NEXT: v_mul_hi_u32 v8, v9, v7
-; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v13, v3
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v14, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v9, v3, v10
+; CHECK-NEXT: v_mul_lo_u32 v11, v12, v10
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8
+; CHECK-NEXT: v_mul_hi_u32 v8, v3, v10
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v11, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8
; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v13, v8
-; CHECK-NEXT: v_mul_hi_u32 v7, v12, v7
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v3
-; CHECK-NEXT: v_addc_u32_e32 v12, vcc, v12, v6, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0
-; CHECK-NEXT: v_mov_b32_e32 v3, v7
-; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4]
-; CHECK-NEXT: v_ashrrev_i32_e32 v10, 31, v5
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v10
-; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v10, vcc
-; CHECK-NEXT: v_xor_b32_e32 v8, v3, v10
-; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6
-; CHECK-NEXT: v_mul_lo_u32 v5, v9, v7
-; CHECK-NEXT: v_xor_b32_e32 v11, v4, v10
-; CHECK-NEXT: v_mul_hi_u32 v4, v9, v6
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT: v_mul_hi_u32 v9, v12, v10
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6
+; CHECK-NEXT: v_addc_u32_e32 v12, vcc, v12, v7, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v3, 0
+; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v12, v[7:8]
+; CHECK-NEXT: v_ashrrev_i32_e32 v13, 31, v5
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v13
+; CHECK-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v3, v[8:9]
+; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v13, vcc
+; CHECK-NEXT: v_xor_b32_e32 v9, v4, v13
+; CHECK-NEXT: v_mul_lo_u32 v4, v12, v6
+; CHECK-NEXT: v_mul_lo_u32 v7, v3, v10
+; CHECK-NEXT: v_xor_b32_e32 v11, v5, v13
+; CHECK-NEXT: v_mul_hi_u32 v5, v3, v6
; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v4, v12, v7
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
-; CHECK-NEXT: v_mul_hi_u32 v5, v9, v7
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_mul_hi_u32 v6, v12, v7
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v5, v12, v10
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; CHECK-NEXT: v_mul_hi_u32 v7, v3, v10
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; CHECK-NEXT: v_mul_hi_u32 v7, v12, v10
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v9, v3
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v12, v5, vcc
; CHECK-NEXT: v_mul_lo_u32 v5, v11, v3
-; CHECK-NEXT: v_mul_lo_u32 v6, v8, v4
-; CHECK-NEXT: v_mul_hi_u32 v7, v8, v3
+; CHECK-NEXT: v_mul_lo_u32 v6, v9, v4
+; CHECK-NEXT: v_mul_hi_u32 v7, v9, v3
; CHECK-NEXT: v_mul_hi_u32 v3, v11, v3
-; CHECK-NEXT: v_mul_hi_u32 v9, v11, v4
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_mul_lo_u32 v7, v11, v4
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_mul_hi_u32 v6, v8, v4
+; CHECK-NEXT: v_mul_hi_u32 v6, v9, v4
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v3, v5
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v7, 0
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v9, v5
-; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5]
-; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v8, v3
-; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v7, v[4:5]
-; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v11, v4, vcc
-; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v11, v4
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1
-; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, v3, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v3
+; CHECK-NEXT: v_mul_hi_u32 v6, v11, v4
+; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v10, 0
+; CHECK-NEXT: v_add_i32_e32 v12, vcc, v6, v5
+; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v12, v[4:5]
+; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v9, v3
+; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v10, v[5:6]
+; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v11, v7, vcc
+; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v11, v7
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1
+; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2
; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1
-; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5]
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v7
-; CHECK-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1
-; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1
+; CHECK-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v10
+; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v12, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v8
-; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v9, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v3, v10, v0
-; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v6
+; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v7, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc
+; CHECK-NEXT: v_xor_b32_e32 v3, v13, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3
; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
@@ -218,67 +216,67 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: s_subb_u32 s5, 0, s11
; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
-; CHECK-NEXT: v_trunc_f32_e32 v2, v1
-; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v0
-; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v2
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2]
-; CHECK-NEXT: v_mul_hi_u32 v5, v3, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2]
-; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0
-; CHECK-NEXT: v_mul_lo_u32 v6, v3, v1
-; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1
-; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1
-; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT: v_trunc_f32_e32 v1, v1
+; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
+; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v0
+; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v1
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2]
+; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3]
+; CHECK-NEXT: v_mul_hi_u32 v2, v6, v0
+; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0
+; CHECK-NEXT: v_mul_lo_u32 v3, v6, v4
+; CHECK-NEXT: v_mul_lo_u32 v5, v7, v4
+; CHECK-NEXT: v_mul_hi_u32 v8, v6, v4
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v0
+; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2]
+; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3]
+; CHECK-NEXT: v_mul_hi_u32 v3, v6, v0
+; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0
+; CHECK-NEXT: v_mul_lo_u32 v2, v6, v4
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2]
-; CHECK-NEXT: v_mul_hi_u32 v6, v3, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2]
-; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0
-; CHECK-NEXT: v_mul_lo_u32 v5, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v6, v4, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT: v_mul_hi_u32 v5, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v3, v7, v4
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT: v_mul_hi_u32 v2, v6, v4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
; CHECK-NEXT: v_mul_lo_u32 v2, s13, v0
; CHECK-NEXT: v_mul_lo_u32 v3, s12, v1
; CHECK-NEXT: v_mul_hi_u32 v4, s12, v0
; CHECK-NEXT: v_mul_hi_u32 v0, s13, v0
-; CHECK-NEXT: v_mul_hi_u32 v5, s13, v1
+; CHECK-NEXT: v_mov_b32_e32 v7, s13
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
@@ -291,39 +289,39 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v0, v2
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v2, v[1:2]
-; CHECK-NEXT: v_mov_b32_e32 v5, s13
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v0
+; CHECK-NEXT: v_mul_hi_u32 v3, s13, v1
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v3, v2
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v4, v[1:2]
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s12, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2]
-; CHECK-NEXT: v_mov_b32_e32 v3, s11
-; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v5, v1, vcc
-; CHECK-NEXT: v_sub_i32_e64 v1, s[0:1], s13, v1
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3]
+; CHECK-NEXT: v_mov_b32_e32 v1, s11
+; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v7, v4, vcc
+; CHECK-NEXT: v_sub_i32_e64 v3, s[0:1], s13, v4
; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v2
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v0
; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0
; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v4
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v2
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s11, v1
-; CHECK-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1]
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v2, v4, v5, s[0:1]
+; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s10, v0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s11, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v3
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
; CHECK-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9]
; CHECK-NEXT: v_xor_b32_e32 v0, s0, v0
; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0
@@ -379,266 +377,260 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc
; GISEL-NEXT: v_xor_b32_e32 v10, v4, v8
-; GISEL-NEXT: v_xor_b32_e32 v4, v5, v8
-; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v10
-; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4
-; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v10
-; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v4, vcc
-; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v9
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5
-; GISEL-NEXT: v_trunc_f32_e32 v9, v9
-; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9
-; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v5
-; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0
-; GISEL-NEXT: v_mov_b32_e32 v5, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v9, v[5:6]
-; GISEL-NEXT: v_mul_lo_u32 v5, v9, v11
-; GISEL-NEXT: v_mul_hi_u32 v17, v14, v11
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13]
-; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11
-; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v17
+; GISEL-NEXT: v_xor_b32_e32 v9, v5, v8
+; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v10
+; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v9
+; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v10
+; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v9, vcc
+; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
+; GISEL-NEXT: v_trunc_f32_e32 v5, v5
+; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v5
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v17, v15, 0
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, v[5:6]
+; GISEL-NEXT: v_mul_lo_u32 v5, v16, v4
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v15, v[11:12]
+; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11
+; GISEL-NEXT: v_mul_hi_u32 v11, v15, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v4, v16, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v17, v9, v12
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5
-; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v17, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13
-; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5
+; GISEL-NEXT: v_mul_lo_u32 v11, v16, v13
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5
+; GISEL-NEXT: v_mul_hi_u32 v12, v15, v13
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v5
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v11, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0
-; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
-; GISEL-NEXT: v_mov_b32_e32 v5, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[5:6]
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13]
-; GISEL-NEXT: v_xor_b32_e32 v15, v0, v9
-; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11
-; GISEL-NEXT: v_mul_lo_u32 v5, v14, v12
-; GISEL-NEXT: v_xor_b32_e32 v16, v1, v9
-; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11
-; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v16, v13
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4
+; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v16, v5, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v17, v15, 0
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, v[5:6]
+; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v15, v[11:12]
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc
+; GISEL-NEXT: v_xor_b32_e32 v17, v0, v5
+; GISEL-NEXT: v_mul_lo_u32 v0, v16, v4
+; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13
+; GISEL-NEXT: v_xor_b32_e32 v18, v1, v5
+; GISEL-NEXT: v_mul_hi_u32 v1, v15, v4
+; GISEL-NEXT: v_mul_hi_u32 v4, v16, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT: v_mul_hi_u32 v5, v14, v12
+; GISEL-NEXT: v_mul_lo_u32 v1, v16, v13
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
+; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5
-; GISEL-NEXT: v_mul_hi_u32 v11, v17, v12
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11
+; GISEL-NEXT: v_mul_hi_u32 v11, v16, v13
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v16, v0
-; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1
-; GISEL-NEXT: v_mul_hi_u32 v12, v15, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0
-; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v4, v18, v0
+; GISEL-NEXT: v_mul_lo_u32 v11, v17, v1
+; GISEL-NEXT: v_mul_hi_u32 v12, v17, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0
+; GISEL-NEXT: v_xor_b32_e32 v5, v5, v8
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v12, v16, v1
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5
-; GISEL-NEXT: v_mul_hi_u32 v11, v15, v1
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v12, v18, v1
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4
+; GISEL-NEXT: v_mul_hi_u32 v11, v17, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v12, v11
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v0, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v5
-; GISEL-NEXT: v_mov_b32_e32 v1, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v14, v[1:2]
-; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v7
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v5
-; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v0, v[12:13]
-; GISEL-NEXT: v_xor_b32_e32 v7, v1, v5
-; GISEL-NEXT: v_xor_b32_e32 v6, v6, v5
-; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7
-; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6
-; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11
-; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12
-; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1
-; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc
-; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v4, vcc
-; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
-; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1
-; GISEL-NEXT: v_trunc_f32_e32 v16, v11
-; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16
-; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1
-; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v7
-; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v6, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0
-; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v10
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v10
-; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v10
-; GISEL-NEXT: v_mov_b32_e32 v1, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v4
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13]
-; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v4
-; GISEL-NEXT: v_mul_lo_u32 v10, v18, v12
-; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10
-; GISEL-NEXT: v_mul_hi_u32 v10, v18, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7]
-; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11
-; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9]
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v4
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9]
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v4, v15, v21, s[4:5]
-; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v10
-; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v17, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e32 v15, v15, v21, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10
-; GISEL-NEXT: v_mul_lo_u32 v13, v16, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
-; GISEL-NEXT: v_mul_hi_u32 v13, v18, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v18, v10
-; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v16, v11, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v19, v12, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
-; GISEL-NEXT: v_mov_b32_e32 v0, v11
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v13, v[0:1]
-; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, v[0:1]
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v11
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v11, vcc
-; GISEL-NEXT: v_xor_b32_e32 v15, v1, v11
-; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10
-; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0
-; GISEL-NEXT: v_xor_b32_e32 v16, v2, v11
-; GISEL-NEXT: v_mul_hi_u32 v2, v12, v10
-; GISEL-NEXT: v_xor_b32_e32 v9, v4, v8
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0
-; GISEL-NEXT: v_mul_hi_u32 v4, v13, v10
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v0
+; GISEL-NEXT: v_mul_hi_u32 v11, v18, v1
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v15, 0
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v11, v4
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v16, v[1:2]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v17, v0
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v9, v15, v[11:12]
+; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v18, v13, vcc
+; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v18, v13
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v9
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v4, v9, vcc
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10
+; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v15
+; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v9
+; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v4
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v7, v4, vcc
+; GISEL-NEXT: v_xor_b32_e32 v7, v6, v4
+; GISEL-NEXT: v_xor_b32_e32 v6, v17, v4
+; GISEL-NEXT: v_cvt_f32_u32_e32 v17, v7
+; GISEL-NEXT: v_cvt_f32_u32_e32 v18, v6
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v9
+; GISEL-NEXT: v_mac_f32_e32 v17, 0x4f800000, v18
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v17
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v14, v0, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v12
+; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v13, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1
+; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
+; GISEL-NEXT: v_trunc_f32_e32 v1, v1
+; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
+; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0
+; GISEL-NEXT: v_sub_i32_e64 v19, s[4:5], 0, v7
+; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1
+; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], 0, v6, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v17, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v9, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v14, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v18, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v21, v18, v0
+; GISEL-NEXT: v_mul_hi_u32 v22, v17, v0
+; GISEL-NEXT: v_mul_hi_u32 v23, v18, v0
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v17, v[9:10]
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
+; GISEL-NEXT: v_cndmask_b32_e32 v10, v15, v12, vcc
+; GISEL-NEXT: v_mul_lo_u32 v1, v17, v0
+; GISEL-NEXT: v_mul_lo_u32 v11, v18, v0
+; GISEL-NEXT: v_ashrrev_i32_e32 v15, 31, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc
+; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v21, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v22
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1
+; GISEL-NEXT: v_mul_hi_u32 v9, v17, v0
+; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v23
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11
+; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0
+; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9
+; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v9
+; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v17, v1
+; GISEL-NEXT: v_addc_u32_e64 v14, s[4:5], v18, v0, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v12, 0
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v14, v[1:2]
+; GISEL-NEXT: v_xor_b32_e32 v1, v10, v5
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v15
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v20, v12, v[8:9]
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v15, vcc
+; GISEL-NEXT: v_xor_b32_e32 v16, v2, v15
+; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0
+; GISEL-NEXT: v_mul_lo_u32 v8, v12, v10
+; GISEL-NEXT: v_xor_b32_e32 v17, v3, v15
; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1
-; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v13, v0, vcc
-; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1
-; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0
-; GISEL-NEXT: v_mul_hi_u32 v4, v15, v1
-; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1
-; GISEL-NEXT: v_xor_b32_e32 v10, v14, v8
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; GISEL-NEXT: v_mul_lo_u32 v3, v14, v10
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2
+; GISEL-NEXT: v_mul_hi_u32 v8, v12, v10
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
+; GISEL-NEXT: v_mul_hi_u32 v8, v14, v10
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GISEL-NEXT: v_mul_hi_u32 v3, v15, v0
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v3, v17, v0
+; GISEL-NEXT: v_mul_lo_u32 v8, v16, v2
+; GISEL-NEXT: v_mul_hi_u32 v9, v16, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0
+; GISEL-NEXT: v_xor_b32_e32 v10, v13, v5
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v1, v2
-; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0
+; GISEL-NEXT: v_mul_lo_u32 v9, v17, v2
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3
+; GISEL-NEXT: v_mul_hi_u32 v8, v16, v2
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT: v_mul_hi_u32 v8, v17, v2
; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v12, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1
-; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v13, v[0:1]
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v9, v8
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v12, v[3:4]
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v8, vcc
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v15, v2
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc
-; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v8, v0
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v13, v[3:4]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v5
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v5, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v12, v[8:9]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v16, v2
+; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v17, v10, vcc
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v17, v10
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v6
+; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v6, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6
-; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v4, v8, v9, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v6
+; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v9, s[4:5]
; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v12
; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v5, v6
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v8
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v8
; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v6, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
-; GISEL-NEXT: v_xor_b32_e32 v4, v11, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
+; GISEL-NEXT: v_xor_b32_e32 v4, v15, v4
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v5, vcc
; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4
; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
@@ -667,100 +659,100 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_xor_b32_e32 v1, v3, v0
; CGP-NEXT: v_cvt_f32_u32_e32 v3, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1
-; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2
-; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc
+; CGP-NEXT: v_sub_i32_e32 v16, vcc, 0, v2
+; CGP-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc
; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4
; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3
; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3
-; CGP-NEXT: v_trunc_f32_e32 v5, v4
-; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5
-; CGP-NEXT: v_cvt_u32_f32_e32 v12, v3
-; CGP-NEXT: v_cvt_u32_f32_e32 v15, v5
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5]
-; CGP-NEXT: v_mul_hi_u32 v16, v12, v3
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5]
-; CGP-NEXT: v_mul_lo_u32 v5, v15, v3
-; CGP-NEXT: v_mul_hi_u32 v3, v15, v3
-; CGP-NEXT: v_mul_lo_u32 v17, v12, v4
-; CGP-NEXT: v_mul_lo_u32 v18, v15, v4
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v17
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16
-; CGP-NEXT: v_mul_hi_u32 v16, v12, v4
+; CGP-NEXT: v_trunc_f32_e32 v4, v4
+; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v15, v3
+; CGP-NEXT: v_cvt_u32_f32_e32 v14, v4
+; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v15, 0
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5]
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v17, v15, v[12:13]
+; CGP-NEXT: v_mul_lo_u32 v5, v14, v3
+; CGP-NEXT: v_mul_hi_u32 v12, v15, v3
+; CGP-NEXT: v_mul_lo_u32 v13, v15, v4
+; CGP-NEXT: v_mul_hi_u32 v3, v14, v3
+; CGP-NEXT: v_mul_lo_u32 v18, v14, v4
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12
+; CGP-NEXT: v_mul_hi_u32 v12, v15, v4
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v17, v5
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5
; CGP-NEXT: v_add_i32_e32 v3, vcc, v18, v3
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v16
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16
-; CGP-NEXT: v_mul_hi_u32 v4, v15, v4
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT: v_mul_hi_u32 v4, v14, v4
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v3
-; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v4, vcc
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5]
-; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v11
-; CGP-NEXT: v_mul_hi_u32 v16, v12, v3
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5]
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v13
-; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v13, vcc
-; CGP-NEXT: v_xor_b32_e32 v11, v5, v13
-; CGP-NEXT: v_mul_lo_u32 v5, v15, v3
-; CGP-NEXT: v_mul_lo_u32 v14, v12, v4
-; CGP-NEXT: v_mul_hi_u32 v3, v15, v3
-; CGP-NEXT: v_xor_b32_e32 v10, v10, v13
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v3
+; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v4, vcc
+; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v15, 0
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5]
+; CGP-NEXT: v_ashrrev_i32_e32 v16, 31, v11
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v17, v15, v[12:13]
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v16
+; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v16, vcc
+; CGP-NEXT: v_xor_b32_e32 v12, v5, v16
+; CGP-NEXT: v_mul_lo_u32 v5, v14, v3
+; CGP-NEXT: v_mul_lo_u32 v11, v15, v4
+; CGP-NEXT: v_xor_b32_e32 v13, v10, v16
+; CGP-NEXT: v_mul_hi_u32 v10, v15, v3
+; CGP-NEXT: v_mul_hi_u32 v3, v14, v3
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v16, v15, v4
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5
-; CGP-NEXT: v_mul_hi_u32 v14, v12, v4
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v16, v3
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14
-; CGP-NEXT: v_mul_hi_u32 v4, v15, v4
+; CGP-NEXT: v_mul_lo_u32 v10, v14, v4
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5
+; CGP-NEXT: v_mul_hi_u32 v11, v15, v4
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v3
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT: v_mul_hi_u32 v4, v14, v4
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3
-; CGP-NEXT: v_addc_u32_e32 v4, vcc, v15, v4, vcc
-; CGP-NEXT: v_mul_lo_u32 v5, v10, v3
-; CGP-NEXT: v_mul_lo_u32 v12, v11, v4
-; CGP-NEXT: v_mul_hi_u32 v14, v11, v3
-; CGP-NEXT: v_mul_hi_u32 v3, v10, v3
-; CGP-NEXT: v_mul_hi_u32 v15, v10, v4
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v15, v3
+; CGP-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc
+; CGP-NEXT: v_mul_lo_u32 v5, v13, v3
+; CGP-NEXT: v_mul_lo_u32 v10, v12, v4
+; CGP-NEXT: v_mul_hi_u32 v11, v12, v3
+; CGP-NEXT: v_mul_hi_u32 v3, v13, v3
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v14, v10, v4
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5
-; CGP-NEXT: v_mul_hi_u32 v12, v11, v4
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12
+; CGP-NEXT: v_mul_lo_u32 v11, v13, v4
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; CGP-NEXT: v_mul_hi_u32 v10, v12, v4
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v11, v3
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v10
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; CGP-NEXT: v_add_i32_e32 v14, vcc, v3, v5
+; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v3
+; CGP-NEXT: v_mul_hi_u32 v10, v13, v4
; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v14, 0
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v5
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v12, v[4:5]
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, v11, v3
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[4:5]
-; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v10, v4, vcc
-; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v10, v4
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v10, v5
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v15, v[4:5]
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, v12, v3
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[10:11]
+; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v13, v4, vcc
+; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v13, v4
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1
; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
@@ -771,13 +763,13 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
; CGP-NEXT: v_cndmask_b32_e64 v5, v10, v11, s[4:5]
; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v14
-; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc
+; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v15, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v2, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v10
; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
@@ -785,8 +777,8 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc
-; CGP-NEXT: v_xor_b32_e32 v3, v13, v0
-; CGP-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
+; CGP-NEXT: v_xor_b32_e32 v3, v16, v0
+; CGP-NEXT: v_cndmask_b32_e32 v2, v15, v2, vcc
; CGP-NEXT: v_xor_b32_e32 v0, v1, v3
; CGP-NEXT: v_xor_b32_e32 v1, v2, v3
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
@@ -840,126 +832,126 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_xor_b32_e32 v3, v5, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v5, v4
; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
-; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v4
-; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc
+; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4
+; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc
; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5
; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
-; CGP-NEXT: v_trunc_f32_e32 v7, v6
-; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7
-; CGP-NEXT: v_cvt_u32_f32_e32 v10, v5
-; CGP-NEXT: v_cvt_u32_f32_e32 v13, v7
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7]
-; CGP-NEXT: v_mul_hi_u32 v14, v10, v5
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7]
-; CGP-NEXT: v_mul_lo_u32 v7, v13, v5
-; CGP-NEXT: v_mul_hi_u32 v5, v13, v5
-; CGP-NEXT: v_mul_lo_u32 v15, v10, v6
-; CGP-NEXT: v_mul_lo_u32 v16, v13, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14
-; CGP-NEXT: v_mul_hi_u32 v14, v10, v6
+; CGP-NEXT: v_trunc_f32_e32 v6, v6
+; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6
+; CGP-NEXT: v_cvt_u32_f32_e32 v13, v5
+; CGP-NEXT: v_cvt_u32_f32_e32 v12, v6
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v13, 0
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7]
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v13, v[10:11]
+; CGP-NEXT: v_mul_lo_u32 v7, v12, v5
+; CGP-NEXT: v_mul_hi_u32 v10, v13, v5
+; CGP-NEXT: v_mul_lo_u32 v11, v13, v6
+; CGP-NEXT: v_mul_hi_u32 v5, v12, v5
+; CGP-NEXT: v_mul_lo_u32 v16, v12, v6
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10
+; CGP-NEXT: v_mul_hi_u32 v10, v13, v6
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v15, v7
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7
; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_mul_hi_u32 v6, v13, v6
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT: v_mul_hi_u32 v6, v12, v6
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7
; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5
-; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v6, vcc
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7]
-; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9
-; CGP-NEXT: v_mul_hi_u32 v14, v10, v5
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7]
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v11
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v11, vcc
-; CGP-NEXT: v_xor_b32_e32 v9, v7, v11
-; CGP-NEXT: v_mul_lo_u32 v7, v13, v5
-; CGP-NEXT: v_mul_lo_u32 v12, v10, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v13, v5
-; CGP-NEXT: v_xor_b32_e32 v8, v8, v11
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v5
+; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v6, vcc
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v13, 0
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7]
+; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v13, v[10:11]
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v14
+; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v14, vcc
+; CGP-NEXT: v_xor_b32_e32 v11, v7, v14
+; CGP-NEXT: v_mul_lo_u32 v7, v12, v5
+; CGP-NEXT: v_mul_lo_u32 v9, v13, v6
+; CGP-NEXT: v_xor_b32_e32 v15, v8, v14
+; CGP-NEXT: v_mul_hi_u32 v8, v13, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v12, v5
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v14, v13, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7
-; CGP-NEXT: v_mul_hi_u32 v12, v10, v6
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12
-; CGP-NEXT: v_mul_hi_u32 v6, v13, v6
+; CGP-NEXT: v_mul_lo_u32 v8, v12, v6
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7
+; CGP-NEXT: v_mul_hi_u32 v9, v13, v6
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT: v_mul_hi_u32 v6, v12, v6
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
-; CGP-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc
-; CGP-NEXT: v_mul_lo_u32 v7, v8, v5
-; CGP-NEXT: v_mul_lo_u32 v10, v9, v6
-; CGP-NEXT: v_mul_hi_u32 v12, v9, v5
-; CGP-NEXT: v_mul_hi_u32 v5, v8, v5
-; CGP-NEXT: v_mul_hi_u32 v13, v8, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5
+; CGP-NEXT: v_addc_u32_e32 v6, vcc, v12, v6, vcc
+; CGP-NEXT: v_mul_lo_u32 v7, v15, v5
+; CGP-NEXT: v_mul_lo_u32 v8, v11, v6
+; CGP-NEXT: v_mul_hi_u32 v9, v11, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v15, v5
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v12, v8, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7
-; CGP-NEXT: v_mul_hi_u32 v10, v9, v6
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10
+; CGP-NEXT: v_mul_lo_u32 v9, v15, v6
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CGP-NEXT: v_mul_hi_u32 v8, v11, v6
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
; CGP-NEXT: v_add_i32_e32 v12, vcc, v5, v7
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v5
+; CGP-NEXT: v_mul_hi_u32 v8, v15, v6
; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v12, 0
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v7
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, v[6:7]
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v9, v5
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[6:7]
-; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v8, v6, vcc
-; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v8, v6
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3
-; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v8, v7
+; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v13, v[6:7]
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v11, v5
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v12, v[7:8]
+; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v15, v9, vcc
+; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v15, v9
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
+; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4
; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3
-; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v7, v8, v9, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
+; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[4:5]
; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12
-; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
+; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
-; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v8
; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
-; CGP-NEXT: v_xor_b32_e32 v5, v11, v2
-; CGP-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc
+; CGP-NEXT: v_xor_b32_e32 v5, v14, v2
+; CGP-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc
; CGP-NEXT: v_xor_b32_e32 v2, v3, v5
; CGP-NEXT: v_xor_b32_e32 v3, v4, v5
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
@@ -1049,82 +1041,82 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb
; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0
-; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705
+; CHECK-NEXT: v_mov_b32_e32 v9, 0xffed2705
; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
-; CHECK-NEXT: v_trunc_f32_e32 v4, v3
-; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
-; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
-; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2
-; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3
-; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3
-; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3
-; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; CHECK-NEXT: v_trunc_f32_e32 v3, v3
+; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3
+; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2
+; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v3
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4]
+; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5]
+; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
+; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6
+; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6
+; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2
-; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
-; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
-; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6
-; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2
-; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3
-; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6
-; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2
+; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4]
+; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5]
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
+; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9
+; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2
+; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6
+; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9
+; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3
+; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8
-; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0
; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1
; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0
-; CHECK-NEXT: v_mov_b32_e32 v5, 0x12d8fb
+; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0
+; CHECK-NEXT: v_mov_b32_e32 v6, 0x12d8fb
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1
+; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0
@@ -1133,40 +1125,40 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
; CHECK-NEXT: v_add_i32_e32 v7, vcc, v0, v2
-; CHECK-NEXT: v_mul_hi_u32 v8, v9, v1
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v7, 0
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v2
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v3, v[1:2]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v0
+; CHECK-NEXT: v_mul_hi_u32 v3, v5, v1
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v3, v2
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[1:2]
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0
-; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc
-; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[4:5]
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v7
-; CHECK-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
+; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc
+; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2
+; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
+; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v1, -1, v3, s[4:5]
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v7
+; CHECK-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v6
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; CHECK-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v4
-; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v3
+; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v4, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; CHECK-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = sdiv i64 %num, 1235195
ret i64 %result
@@ -1186,77 +1178,75 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: s_subb_u32 s6, 0, 0
; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT: v_trunc_f32_e32 v7, v5
-; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7
-; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v7
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0
-; GISEL-NEXT: v_mov_b32_e32 v7, v5
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8]
-; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[10:11]
-; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4
-; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4
-; GISEL-NEXT: v_mul_lo_u32 v7, v8, v13
-; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7
+; GISEL-NEXT: v_trunc_f32_e32 v5, v5
+; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v5
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6]
+; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4
+; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[9:10]
+; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4
+; GISEL-NEXT: v_mul_lo_u32 v9, v7, v13
+; GISEL-NEXT: v_mul_lo_u32 v4, v8, v13
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
-; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9
+; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc
-; GISEL-NEXT: v_mov_b32_e32 v4, v14
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v7, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v4, v7, v13
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15]
+; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v6, v17, v[14:15]
+; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13
+; GISEL-NEXT: v_mul_hi_u32 v18, v4, v13
+; GISEL-NEXT: v_mul_hi_u32 v19, v17, v13
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v4, v[15:16]
; GISEL-NEXT: s_mov_b32 s6, 1
; GISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GISEL-NEXT: v_mul_lo_u32 v15, v16, v14
+; GISEL-NEXT: v_mul_lo_u32 v14, v4, v13
+; GISEL-NEXT: v_mul_hi_u32 v15, v4, v13
; GISEL-NEXT: s_subb_u32 s6, 0, 0
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v15
-; GISEL-NEXT: v_mul_hi_u32 v15, v16, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT: v_mul_hi_u32 v13, v7, v13
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v15, v7, v14
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v4
-; GISEL-NEXT: v_mul_hi_u32 v4, v16, v14
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4
-; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; GISEL-NEXT: v_xor_b32_e32 v18, v0, v4
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v17
-; GISEL-NEXT: v_mul_hi_u32 v13, v7, v14
-; GISEL-NEXT: v_xor_b32_e32 v19, v1, v4
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v9
+; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13
+; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v19
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v9
+; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
+; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v14
+; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v15, v1
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0
; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1
; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0
; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0
-; GISEL-NEXT: v_mov_b32_e32 v7, 0x12d8fb
+; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
@@ -1269,149 +1259,147 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13
-; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v15, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v13
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v7, v16, v[1:2]
+; GISEL-NEXT: v_mul_hi_u32 v1, v19, v1
+; GISEL-NEXT: v_add_i32_e32 v17, vcc, v0, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0
+; GISEL-NEXT: v_add_i32_e32 v20, vcc, v1, v0
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v17, 0
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v20, v[1:2]
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v0
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14]
-; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc
-; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7
+; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v17, v[13:14]
+; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v15, vcc
+; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v15
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v15
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v7
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v17
+; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v20, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc
-; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1]
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v13
-; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v17, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
-; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v14, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e32 v19, -1, v0, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6]
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[0:1]
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v16
+; GISEL-NEXT: v_mul_lo_u32 v5, v7, v13
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v18, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
+; GISEL-NEXT: v_cndmask_b32_e32 v14, v16, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v16, v18, v1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v5, v8, v13
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GISEL-NEXT: v_mul_hi_u32 v1, v7, v13
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; GISEL-NEXT: v_mul_hi_u32 v10, v8, v13
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_mul_hi_u32 v10, v8, v0
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
-; GISEL-NEXT: v_cndmask_b32_e32 v11, v16, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
-; GISEL-NEXT: v_cndmask_b32_e32 v10, v15, v13, vcc
-; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6]
-; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
-; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10
-; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0
-; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5
-; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10
-; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v0
+; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v8, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2]
+; GISEL-NEXT: v_xor_b32_e32 v1, v5, v9
+; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[7:8]
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v20, v16, vcc
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc
+; GISEL-NEXT: v_xor_b32_e32 v14, v2, v13
+; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0
+; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5
+; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13
+; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5
+; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5
+; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0
-; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2
-; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
-; GISEL-NEXT: v_xor_b32_e32 v8, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0
+; GISEL-NEXT: v_mul_lo_u32 v5, v14, v2
+; GISEL-NEXT: v_mul_hi_u32 v6, v14, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
+; GISEL-NEXT: v_xor_b32_e32 v7, v12, v9
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2
+; GISEL-NEXT: v_mul_lo_u32 v6, v15, v2
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
-; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2
+; GISEL-NEXT: v_mul_hi_u32 v5, v14, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3
-; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v9, 0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0
-; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v11, v[0:1]
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v9, v[5:6]
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc
-; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3
-; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v9
-; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v11, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7
+; GISEL-NEXT: v_mul_hi_u32 v5, v15, v2
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v5, v0
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, v[3:4]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v9
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v9, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v7
+; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v15, v7, vcc
+; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3
+; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v10
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
; GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v6, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v6
+; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v4, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, v2, v13
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v13
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v13
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_sdiv_v2i64_oddk_denom:
@@ -1424,27 +1412,26 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; CGP-NEXT: v_trunc_f32_e32 v8, v5
-; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8
+; CGP-NEXT: v_trunc_f32_e32 v5, v5
+; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5
; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8
+; CGP-NEXT: v_cvt_u32_f32_e32 v8, v5
; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
-; CGP-NEXT: v_mov_b32_e32 v9, v5
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10]
-; CGP-NEXT: v_mul_hi_u32 v11, v7, v4
-; CGP-NEXT: v_mul_hi_u32 v12, v8, v4
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10]
-; CGP-NEXT: v_mul_lo_u32 v10, v8, v4
-; CGP-NEXT: v_mul_lo_u32 v4, v7, v9
-; CGP-NEXT: v_mul_lo_u32 v13, v8, v9
-; CGP-NEXT: v_mul_hi_u32 v14, v7, v9
-; CGP-NEXT: v_mul_hi_u32 v9, v8, v9
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6]
+; CGP-NEXT: v_mul_hi_u32 v11, v8, v4
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v7, v[9:10]
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v4
+; CGP-NEXT: v_mul_hi_u32 v10, v7, v4
+; CGP-NEXT: v_mul_lo_u32 v4, v7, v12
+; CGP-NEXT: v_mul_lo_u32 v13, v8, v12
+; CGP-NEXT: v_mul_hi_u32 v14, v7, v12
+; CGP-NEXT: v_mul_hi_u32 v12, v8, v12
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v11
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
@@ -1452,41 +1439,40 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc
-; CGP-NEXT: v_mov_b32_e32 v4, v14
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; CGP-NEXT: v_mul_lo_u32 v4, v17, v13
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15]
-; CGP-NEXT: v_mul_lo_u32 v9, v16, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT: v_mul_hi_u32 v9, v16, v13
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_hi_u32 v9, v17, v13
-; CGP-NEXT: v_mul_lo_u32 v13, v17, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_mul_hi_u32 v15, v16, v14
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; CGP-NEXT: v_addc_u32_e32 v16, vcc, v8, v12, vcc
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v6, v4, 0
+; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v16, v[13:14]
+; CGP-NEXT: v_mul_lo_u32 v17, v16, v12
+; CGP-NEXT: v_mul_hi_u32 v18, v4, v12
+; CGP-NEXT: v_mul_hi_u32 v19, v16, v12
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v4, v[14:15]
+; CGP-NEXT: v_mul_lo_u32 v13, v4, v12
+; CGP-NEXT: v_mul_hi_u32 v15, v4, v12
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v17, v13
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15
-; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9
-; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9
-; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
-; CGP-NEXT: v_xor_b32_e32 v18, v0, v9
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v17, v14
-; CGP-NEXT: v_xor_b32_e32 v19, v1, v9
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT: v_mul_lo_u32 v14, v16, v12
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19
+; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15
+; CGP-NEXT: v_mul_hi_u32 v17, v16, v12
+; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc
+; CGP-NEXT: v_xor_b32_e32 v18, v0, v12
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v13
+; CGP-NEXT: v_xor_b32_e32 v19, v1, v12
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v17, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc
; CGP-NEXT: v_mul_lo_u32 v13, v19, v0
; CGP-NEXT: v_mul_lo_u32 v14, v18, v1
; CGP-NEXT: v_mul_hi_u32 v15, v18, v0
@@ -1504,12 +1490,12 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT: v_mul_hi_u32 v1, v19, v1
; CGP-NEXT: v_add_i32_e32 v15, vcc, v0, v13
-; CGP-NEXT: v_mul_hi_u32 v16, v19, v1
+; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v0
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v1, v0
; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v13
; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2]
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v18, v0
; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc
@@ -1519,106 +1505,105 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v1, vcc
-; CGP-NEXT: v_add_i32_e32 v17, vcc, 1, v15
-; CGP-NEXT: v_addc_u32_e32 v18, vcc, 0, v16, vcc
+; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
+; CGP-NEXT: v_add_i32_e32 v18, vcc, 1, v15
+; CGP-NEXT: v_addc_u32_e32 v19, vcc, 0, v16, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
-; CGP-NEXT: v_mov_b32_e32 v0, v5
-; CGP-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5]
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1]
-; CGP-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1]
-; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v19, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v17
-; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v18, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; CGP-NEXT: v_mul_lo_u32 v5, v7, v0
-; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11
-; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v10, v8, v0
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1
-; CGP-NEXT: v_mul_hi_u32 v5, v7, v0
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
-; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v17, vcc
-; CGP-NEXT: v_xor_b32_e32 v11, v5, v9
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2]
-; CGP-NEXT: v_cndmask_b32_e32 v10, v16, v13, vcc
-; CGP-NEXT: v_xor_b32_e32 v1, v10, v9
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6]
-; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
-; CGP-NEXT: v_xor_b32_e32 v12, v2, v10
-; CGP-NEXT: v_mul_lo_u32 v2, v8, v0
-; CGP-NEXT: v_mul_lo_u32 v6, v7, v5
-; CGP-NEXT: v_xor_b32_e32 v13, v3, v10
-; CGP-NEXT: v_mul_hi_u32 v3, v7, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
+; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v14, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6]
+; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v18
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[0:1]
+; CGP-NEXT: v_addc_u32_e32 v21, vcc, 0, v19, vcc
+; CGP-NEXT: v_mul_lo_u32 v1, v7, v13
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
+; CGP-NEXT: v_cndmask_b32_e32 v0, v18, v5, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v5, v19, v21, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; CGP-NEXT: v_cndmask_b32_e32 v14, v15, v0, vcc
+; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v9, v1
+; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v0, v10
+; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v13
+; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0
+; CGP-NEXT: v_mul_hi_u32 v1, v7, v13
+; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9
+; CGP-NEXT: v_mul_hi_u32 v10, v8, v13
+; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0
+; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1
+; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1
+; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v7, v0
+; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v1, s[4:5]
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0
+; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc
+; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v3
+; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v10, v[1:2]
+; CGP-NEXT: v_xor_b32_e32 v1, v5, v12
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v9, v[7:8]
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc
+; CGP-NEXT: v_xor_b32_e32 v7, v2, v13
+; CGP-NEXT: v_mul_lo_u32 v2, v10, v0
+; CGP-NEXT: v_mul_lo_u32 v6, v9, v5
+; CGP-NEXT: v_xor_b32_e32 v8, v3, v13
+; CGP-NEXT: v_mul_hi_u32 v3, v9, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v10, v0
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v3, v8, v5
+; CGP-NEXT: v_mul_lo_u32 v3, v10, v5
; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CGP-NEXT: v_mul_hi_u32 v6, v7, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v9, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v8, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v10, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0
-; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc
-; CGP-NEXT: v_mul_lo_u32 v5, v13, v3
-; CGP-NEXT: v_mul_lo_u32 v6, v12, v2
-; CGP-NEXT: v_mul_hi_u32 v7, v12, v3
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v0
+; CGP-NEXT: v_addc_u32_e32 v2, vcc, v10, v2, vcc
+; CGP-NEXT: v_mul_lo_u32 v5, v8, v3
+; CGP-NEXT: v_mul_lo_u32 v6, v7, v2
+; CGP-NEXT: v_xor_b32_e32 v11, v14, v12
+; CGP-NEXT: v_mul_hi_u32 v9, v7, v3
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v12
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v7, v13, v2
-; CGP-NEXT: v_mul_hi_u32 v3, v13, v3
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v2
+; CGP-NEXT: v_mul_hi_u32 v3, v8, v3
; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_mul_hi_u32 v6, v12, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v6, v7, v2
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v3, v5
-; CGP-NEXT: v_mul_hi_u32 v8, v13, v2
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v5
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, v[3:4]
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
-; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v3, v5
+; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v3
+; CGP-NEXT: v_mul_hi_u32 v6, v8, v2
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v6, v5
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v10, v[3:4]
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
+; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc
+; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5
; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
@@ -1626,24 +1611,24 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3
; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; CGP-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v7
-; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v8, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v9
+; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
; CGP-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v6
-; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc
+; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; CGP-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v10
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v10
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v13
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v13
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = sdiv <2 x i64> %num, <i64 1235195, i64 1235195>
ret <2 x i64> %result
@@ -1679,126 +1664,126 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: v_xor_b32_e32 v1, v5, v0
; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v2
; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1
-; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v2
-; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc
+; CHECK-NEXT: v_sub_i32_e32 v13, vcc, 0, v2
+; CHECK-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc
; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5
; CHECK-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
-; CHECK-NEXT: v_trunc_f32_e32 v7, v6
-; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7
-; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v5
-; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7
-; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7]
-; CHECK-NEXT: v_mul_hi_u32 v12, v8, v5
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7]
-; CHECK-NEXT: v_mul_lo_u32 v7, v11, v5
+; CHECK-NEXT: v_trunc_f32_e32 v6, v6
+; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6
+; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v5
+; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v6
+; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0
+; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v11, v[6:7]
+; CHECK-NEXT: v_mul_lo_u32 v6, v11, v5
+; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v12, v[7:8]
+; CHECK-NEXT: v_mul_hi_u32 v7, v12, v5
; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5
-; CHECK-NEXT: v_mul_lo_u32 v13, v8, v6
-; CHECK-NEXT: v_mul_lo_u32 v14, v11, v6
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v13
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v12
-; CHECK-NEXT: v_mul_hi_u32 v12, v8, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v13, v7
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v14, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6
+; CHECK-NEXT: v_mul_lo_u32 v8, v12, v9
+; CHECK-NEXT: v_mul_lo_u32 v10, v11, v9
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; CHECK-NEXT: v_mul_hi_u32 v7, v12, v9
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v5
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT: v_mul_hi_u32 v8, v11, v9
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
+; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v5
; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v6, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7]
-; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v9
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7]
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v9, vcc
-; CHECK-NEXT: v_xor_b32_e32 v7, v3, v9
+; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0
+; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v11, v[6:7]
+; CHECK-NEXT: v_ashrrev_i32_e32 v13, 31, v4
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v13
+; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v12, v[7:8]
+; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v13, vcc
+; CHECK-NEXT: v_xor_b32_e32 v10, v3, v13
; CHECK-NEXT: v_mul_lo_u32 v3, v11, v5
-; CHECK-NEXT: v_mul_lo_u32 v10, v8, v6
-; CHECK-NEXT: v_xor_b32_e32 v12, v4, v9
-; CHECK-NEXT: v_mul_hi_u32 v4, v8, v5
+; CHECK-NEXT: v_mul_lo_u32 v6, v12, v9
+; CHECK-NEXT: v_xor_b32_e32 v14, v4, v13
+; CHECK-NEXT: v_mul_hi_u32 v4, v12, v5
; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v10
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v10, v3
-; CHECK-NEXT: v_mul_hi_u32 v10, v8, v6
+; CHECK-NEXT: v_mul_lo_u32 v4, v11, v9
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3
+; CHECK-NEXT: v_mul_hi_u32 v6, v12, v9
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; CHECK-NEXT: v_mul_hi_u32 v6, v11, v9
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v12, v3
; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v11, v4, vcc
-; CHECK-NEXT: v_mul_lo_u32 v5, v12, v3
-; CHECK-NEXT: v_mul_lo_u32 v6, v7, v4
-; CHECK-NEXT: v_mul_hi_u32 v8, v7, v3
-; CHECK-NEXT: v_mul_hi_u32 v3, v12, v3
-; CHECK-NEXT: v_mul_hi_u32 v10, v12, v4
+; CHECK-NEXT: v_mul_lo_u32 v5, v14, v3
+; CHECK-NEXT: v_mul_lo_u32 v6, v10, v4
+; CHECK-NEXT: v_mul_hi_u32 v7, v10, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v14, v3
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v8, v12, v4
+; CHECK-NEXT: v_mul_lo_u32 v7, v14, v4
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_mul_hi_u32 v6, v7, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_mul_hi_u32 v6, v10, v4
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v3, v5
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v8, 0
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v5
-; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5]
-; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v7, v3
-; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v8, v[4:5]
-; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v4, vcc
-; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v4
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1
-; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v3, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v3
+; CHECK-NEXT: v_mul_hi_u32 v6, v14, v4
+; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v9, 0
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v6, v5
+; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[4:5]
+; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v10, v3
+; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v9, v[5:6]
+; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v7, vcc
+; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v7
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1
+; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2
; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1
-; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[4:5]
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, 1, v8
-; CHECK-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1
-; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1
+; CHECK-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v9
+; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v7
-; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v10, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v6
+; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v7, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v3, v9, v0
-; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
+; CHECK-NEXT: v_xor_b32_e32 v3, v13, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3
; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
@@ -1839,274 +1824,268 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-LABEL: v_sdiv_v2i64_pow2_shl_denom:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v9, 0x1000
-; GISEL-NEXT: v_mov_b32_e32 v10, 0
-; GISEL-NEXT: v_lshl_b64 v[7:8], v[9:10], v4
-; GISEL-NEXT: v_lshl_b64 v[9:10], v[9:10], v6
+; GISEL-NEXT: v_mov_b32_e32 v12, 0x1000
+; GISEL-NEXT: v_mov_b32_e32 v13, 0
+; GISEL-NEXT: v_lshl_b64 v[7:8], v[12:13], v4
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v8
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v4
; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v8, v4, vcc
; GISEL-NEXT: v_xor_b32_e32 v8, v5, v4
-; GISEL-NEXT: v_xor_b32_e32 v5, v7, v4
-; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v8
-; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5
-; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v8
-; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v5, vcc
-; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v11
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7
-; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v10
-; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7
-; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v7
-; GISEL-NEXT: v_trunc_f32_e32 v13, v11
-; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v13
-; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v7
-; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v13
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0
-; GISEL-NEXT: v_mov_b32_e32 v7, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8]
-; GISEL-NEXT: v_mul_lo_u32 v7, v17, v11
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13]
-; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13
-; GISEL-NEXT: v_mul_hi_u32 v13, v14, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v17, v12
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v18, v7
-; GISEL-NEXT: v_mul_hi_u32 v18, v14, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18
-; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7
+; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4
+; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v8
+; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v7
+; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v8
+; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v7, vcc
+; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v9
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5
+; GISEL-NEXT: v_trunc_f32_e32 v9, v9
+; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9
+; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v9
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v5, 0
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v11, v[10:11]
+; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9
+; GISEL-NEXT: v_mul_hi_u32 v19, v5, v9
+; GISEL-NEXT: v_mul_hi_u32 v20, v11, v9
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v18, v5, v[14:15]
+; GISEL-NEXT: v_mul_lo_u32 v10, v5, v9
+; GISEL-NEXT: v_mul_lo_u32 v15, v11, v9
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v17, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v19
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10
+; GISEL-NEXT: v_mul_hi_u32 v14, v5, v9
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v20
+; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15
+; GISEL-NEXT: v_mul_hi_u32 v9, v11, v9
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
+; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v11, v9, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v5, 0
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v19, v[11:12]
+; GISEL-NEXT: v_mul_lo_u32 v9, v19, v10
+; GISEL-NEXT: v_lshl_b64 v[12:13], v[12:13], v6
+; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v5, v[14:15]
+; GISEL-NEXT: v_mul_hi_u32 v14, v5, v10
+; GISEL-NEXT: v_mul_lo_u32 v11, v5, v16
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v7
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v11, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0
-; GISEL-NEXT: v_mov_b32_e32 v7, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8]
-; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13]
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
-; GISEL-NEXT: v_xor_b32_e32 v15, v0, v7
-; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11
-; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12
-; GISEL-NEXT: v_xor_b32_e32 v16, v1, v7
-; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11
-; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14
+; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1
+; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v9
+; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], v1, v9, s[4:5]
+; GISEL-NEXT: v_xor_b32_e32 v14, v0, v9
+; GISEL-NEXT: v_xor_b32_e32 v15, v1, v9
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0
-; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12
+; GISEL-NEXT: v_mul_hi_u32 v1, v19, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v19, v16
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
+; GISEL-NEXT: v_mul_hi_u32 v11, v5, v16
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT: v_mul_hi_u32 v11, v19, v16
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v16, v0
-; GISEL-NEXT: v_mul_lo_u32 v12, v15, v1
-; GISEL-NEXT: v_mul_hi_u32 v13, v15, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0
-; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v5, v15, v0
+; GISEL-NEXT: v_mul_lo_u32 v10, v14, v1
+; GISEL-NEXT: v_mul_hi_u32 v11, v14, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
+; GISEL-NEXT: v_xor_b32_e32 v4, v9, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; GISEL-NEXT: v_mul_hi_u32 v10, v14, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v16, v1
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_mul_hi_u32 v12, v15, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v12
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11
-; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v0, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v1, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v14, v[1:2]
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v6
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v10, v6, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v5, v0, v[12:13]
-; GISEL-NEXT: v_xor_b32_e32 v10, v1, v6
-; GISEL-NEXT: v_xor_b32_e32 v9, v9, v6
-; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v10
-; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v9
-; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11
-; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12
-; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1
-; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc
-; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v5, vcc
-; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
-; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1
-; GISEL-NEXT: v_trunc_f32_e32 v16, v11
-; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v0, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v0
+; GISEL-NEXT: v_mul_hi_u32 v10, v15, v1
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v16, 0
+; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3
+; GISEL-NEXT: v_add_i32_e32 v17, vcc, v10, v5
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v17, v[1:2]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v14, v0
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v16, v[10:11]
+; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v15, v5, vcc
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v5
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v7
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v7, vcc
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v16
+; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v17, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v7
+; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v11, v6, v10, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v12, v5
+; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v13, v5, vcc
+; GISEL-NEXT: v_xor_b32_e32 v10, v6, v5
+; GISEL-NEXT: v_xor_b32_e32 v6, v12, v5
+; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v10
+; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v7
+; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v13
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v12
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v18, v0, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v14
+; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v15, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1
+; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
+; GISEL-NEXT: v_trunc_f32_e32 v1, v1
+; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
+; GISEL-NEXT: v_cndmask_b32_e32 v13, v14, v7, vcc
+; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v0
+; GISEL-NEXT: v_sub_i32_e64 v19, s[4:5], 0, v10
; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1
-; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v10
-; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v9, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0
-; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v8
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v8
-; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v8
-; GISEL-NEXT: v_mov_b32_e32 v1, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v5
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13]
-; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v5
-; GISEL-NEXT: v_mul_lo_u32 v8, v18, v12
-; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8
-; GISEL-NEXT: v_mul_hi_u32 v8, v18, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7]
-; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11
-; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9]
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v5
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9]
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v5, v15, v21, s[4:5]
-; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v8
-; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v17, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v21, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7]
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
-; GISEL-NEXT: v_mul_lo_u32 v15, v16, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v15, v11
-; GISEL-NEXT: v_mul_hi_u32 v15, v18, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v11
-; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v16, v12, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v19, v13, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v15, v[0:1]
-; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v8, v14, v8, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v13, v[0:1]
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v12
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v12, vcc
-; GISEL-NEXT: v_xor_b32_e32 v14, v1, v12
-; GISEL-NEXT: v_mul_lo_u32 v1, v15, v11
-; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0
-; GISEL-NEXT: v_xor_b32_e32 v16, v2, v12
-; GISEL-NEXT: v_mul_hi_u32 v2, v13, v11
-; GISEL-NEXT: v_mul_hi_u32 v4, v15, v11
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], 0, v6, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v14, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v15, v12, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v19, v18, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v21, v18, v0
+; GISEL-NEXT: v_mul_hi_u32 v22, v14, v0
+; GISEL-NEXT: v_mul_hi_u32 v23, v18, v0
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v14, v[7:8]
+; GISEL-NEXT: v_cndmask_b32_e32 v11, v16, v13, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v15, v17, v12, vcc
+; GISEL-NEXT: v_mul_lo_u32 v1, v14, v0
+; GISEL-NEXT: v_mul_lo_u32 v8, v18, v0
+; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v21, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v22
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v7, v1
+; GISEL-NEXT: v_mul_hi_u32 v7, v14, v0
+; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v23
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8
+; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0
+; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v7, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7
+; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v7
+; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v14, v1
+; GISEL-NEXT: v_addc_u32_e64 v14, s[4:5], v18, v0, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v13, 0
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v19, v14, v[1:2]
+; GISEL-NEXT: v_xor_b32_e32 v1, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v20, v13, v[7:8]
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc
+; GISEL-NEXT: v_xor_b32_e32 v12, v2, v9
+; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0
+; GISEL-NEXT: v_mul_lo_u32 v7, v13, v11
+; GISEL-NEXT: v_xor_b32_e32 v16, v3, v9
; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v15, v0, vcc
-; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1
-; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0
-; GISEL-NEXT: v_mul_hi_u32 v4, v14, v1
-; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1
-; GISEL-NEXT: v_xor_b32_e32 v5, v5, v7
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; GISEL-NEXT: v_mul_lo_u32 v3, v14, v11
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; GISEL-NEXT: v_mul_hi_u32 v7, v13, v11
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v14, v11
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GISEL-NEXT: v_mul_hi_u32 v3, v14, v0
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v1, v2
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v3, v16, v0
+; GISEL-NEXT: v_mul_lo_u32 v7, v12, v2
+; GISEL-NEXT: v_mul_hi_u32 v8, v12, v0
; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v11, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1
-; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v13, v[0:1]
-; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v5, v7
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4]
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2
+; GISEL-NEXT: v_xor_b32_e32 v11, v15, v4
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v8, v16, v2
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3
+; GISEL-NEXT: v_mul_hi_u32 v7, v12, v2
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0
+; GISEL-NEXT: v_mul_hi_u32 v7, v16, v2
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v7, v0
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v14, v[3:4]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v11, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v13, v[7:8]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc
; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v9
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v10
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6
; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v13
+; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v14, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5
-; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v7, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v7
+; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
-; GISEL-NEXT: v_xor_b32_e32 v4, v12, v6
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v13, v2, vcc
+; GISEL-NEXT: v_xor_b32_e32 v4, v9, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc
; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4
; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
@@ -2138,126 +2117,126 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_xor_b32_e32 v1, v10, v0
; CGP-NEXT: v_cvt_f32_u32_e32 v10, v4
; CGP-NEXT: v_cvt_f32_u32_e32 v11, v1
-; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4
-; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc
+; CGP-NEXT: v_sub_i32_e32 v18, vcc, 0, v4
+; CGP-NEXT: v_subb_u32_e32 v19, vcc, 0, v1, vcc
; CGP-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11
; CGP-NEXT: v_rcp_iflag_f32_e32 v10, v10
; CGP-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10
; CGP-NEXT: v_mul_f32_e32 v11, 0x2f800000, v10
-; CGP-NEXT: v_trunc_f32_e32 v12, v11
-; CGP-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12
-; CGP-NEXT: v_cvt_u32_f32_e32 v13, v10
-; CGP-NEXT: v_cvt_u32_f32_e32 v16, v12
-; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12]
-; CGP-NEXT: v_mul_hi_u32 v17, v13, v10
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
-; CGP-NEXT: v_mul_lo_u32 v12, v16, v10
+; CGP-NEXT: v_trunc_f32_e32 v11, v11
+; CGP-NEXT: v_mac_f32_e32 v10, 0xcf800000, v11
+; CGP-NEXT: v_cvt_u32_f32_e32 v17, v10
+; CGP-NEXT: v_cvt_u32_f32_e32 v16, v11
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v18, v17, 0
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v18, v16, v[11:12]
+; CGP-NEXT: v_mul_lo_u32 v11, v16, v10
+; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v19, v17, v[12:13]
+; CGP-NEXT: v_mul_hi_u32 v12, v17, v10
; CGP-NEXT: v_mul_hi_u32 v10, v16, v10
-; CGP-NEXT: v_mul_lo_u32 v18, v13, v11
-; CGP-NEXT: v_mul_lo_u32 v19, v16, v11
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v18
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v17
-; CGP-NEXT: v_mul_hi_u32 v17, v13, v11
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v18, v12
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v19, v10
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v17
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17
-; CGP-NEXT: v_mul_hi_u32 v11, v16, v11
+; CGP-NEXT: v_mul_lo_u32 v13, v17, v14
+; CGP-NEXT: v_mul_lo_u32 v15, v16, v14
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT: v_mul_hi_u32 v12, v17, v14
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v10
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT: v_mul_hi_u32 v13, v16, v14
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v10
; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v11, vcc
-; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12]
-; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
-; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc
-; CGP-NEXT: v_xor_b32_e32 v12, v8, v14
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v18, v17, 0
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v18, v16, v[11:12]
+; CGP-NEXT: v_ashrrev_i32_e32 v18, 31, v9
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v18
+; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v19, v17, v[12:13]
+; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v18, vcc
+; CGP-NEXT: v_xor_b32_e32 v15, v8, v18
; CGP-NEXT: v_mul_lo_u32 v8, v16, v10
-; CGP-NEXT: v_mul_lo_u32 v15, v13, v11
-; CGP-NEXT: v_xor_b32_e32 v17, v9, v14
-; CGP-NEXT: v_mul_hi_u32 v9, v13, v10
+; CGP-NEXT: v_mul_lo_u32 v11, v17, v14
+; CGP-NEXT: v_xor_b32_e32 v19, v9, v18
+; CGP-NEXT: v_mul_hi_u32 v9, v17, v10
; CGP-NEXT: v_mul_hi_u32 v10, v16, v10
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v9, v16, v11
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8
-; CGP-NEXT: v_mul_hi_u32 v15, v13, v11
+; CGP-NEXT: v_mul_lo_u32 v9, v16, v14
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8
+; CGP-NEXT: v_mul_hi_u32 v11, v17, v14
; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v15
-; CGP-NEXT: v_mul_hi_u32 v11, v16, v11
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT: v_mul_hi_u32 v11, v16, v14
; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v17, v8
; CGP-NEXT: v_addc_u32_e32 v9, vcc, v16, v9, vcc
-; CGP-NEXT: v_mul_lo_u32 v10, v17, v8
-; CGP-NEXT: v_mul_lo_u32 v11, v12, v9
-; CGP-NEXT: v_mul_hi_u32 v13, v12, v8
-; CGP-NEXT: v_mul_hi_u32 v8, v17, v8
-; CGP-NEXT: v_mul_hi_u32 v15, v17, v9
+; CGP-NEXT: v_mul_lo_u32 v10, v19, v8
+; CGP-NEXT: v_mul_lo_u32 v11, v15, v9
+; CGP-NEXT: v_mul_hi_u32 v12, v15, v8
+; CGP-NEXT: v_mul_hi_u32 v8, v19, v8
; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v17, v9
+; CGP-NEXT: v_mul_lo_u32 v12, v19, v9
; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_mul_hi_u32 v11, v12, v9
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v11, v15, v9
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v8, v10
-; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v13, 0
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v15, v10
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v11, v[9:10]
-; CGP-NEXT: v_sub_i32_e32 v8, vcc, v12, v8
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v13, v[9:10]
-; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v17, v9, vcc
-; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v17, v9
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1
-; CGP-NEXT: v_subb_u32_e32 v9, vcc, v9, v1, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v8, v10
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v8
+; CGP-NEXT: v_mul_hi_u32 v11, v19, v9
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v14, 0
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v11, v10
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v4, v16, v[9:10]
+; CGP-NEXT: v_sub_i32_e32 v8, vcc, v15, v8
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v1, v14, v[10:11]
+; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v19, v12, vcc
+; CGP-NEXT: v_sub_i32_e64 v10, s[4:5], v19, v12
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1
+; CGP-NEXT: v_subb_u32_e32 v10, vcc, v10, v1, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v4
; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v4
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v1
-; CGP-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v10, v12, v15, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v13
-; CGP-NEXT: v_addc_u32_e32 v15, vcc, 0, v11, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v1
+; CGP-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v10, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v9, v11, v12, s[4:5]
+; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v14
+; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v16, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v10, v1
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v1
-; CGP-NEXT: v_cndmask_b32_e32 v1, v16, v4, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v12
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v15, vcc
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v10, v1
+; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v4, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v11
+; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v12, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v8, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc
-; CGP-NEXT: v_xor_b32_e32 v8, v14, v0
-; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v4, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v4, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v4, v12, v8, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
+; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc
+; CGP-NEXT: v_xor_b32_e32 v8, v18, v0
+; CGP-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc
; CGP-NEXT: v_xor_b32_e32 v0, v1, v8
; CGP-NEXT: v_xor_b32_e32 v1, v4, v8
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
@@ -2313,128 +2292,126 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_xor_b32_e32 v3, v6, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v6, v4
; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3
-; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v4
-; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc
+; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v4
+; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v3, vcc
; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v8
; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6
; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6
; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6
-; CGP-NEXT: v_trunc_f32_e32 v10, v8
-; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v10
-; CGP-NEXT: v_cvt_u32_f32_e32 v11, v6
-; CGP-NEXT: v_cvt_u32_f32_e32 v14, v10
-; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0
-; CGP-NEXT: v_mov_b32_e32 v6, v9
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7]
-; CGP-NEXT: v_mul_lo_u32 v6, v14, v8
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10]
-; CGP-NEXT: v_mul_hi_u32 v10, v11, v8
+; CGP-NEXT: v_trunc_f32_e32 v8, v8
+; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8
+; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
+; CGP-NEXT: v_cvt_u32_f32_e32 v14, v8
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v15, v6, 0
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v14, v[9:10]
+; CGP-NEXT: v_mul_lo_u32 v9, v14, v8
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v6, v[10:11]
+; CGP-NEXT: v_mul_hi_u32 v10, v6, v8
; CGP-NEXT: v_mul_hi_u32 v8, v14, v8
-; CGP-NEXT: v_mul_lo_u32 v15, v11, v9
-; CGP-NEXT: v_mul_lo_u32 v16, v14, v9
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v11, v9
-; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v15, v6
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v16, v8
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v11, v6, v12
+; CGP-NEXT: v_mul_lo_u32 v13, v14, v12
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; CGP-NEXT: v_mul_hi_u32 v10, v6, v12
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10
-; CGP-NEXT: v_mul_hi_u32 v9, v14, v9
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v6
-; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v8, vcc
-; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0
-; CGP-NEXT: v_mov_b32_e32 v6, v9
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7]
-; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10]
-; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v12, vcc
-; CGP-NEXT: v_xor_b32_e32 v10, v5, v12
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT: v_mul_hi_u32 v11, v14, v12
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v9, vcc
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v15, v6, 0
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v14, v[9:10]
+; CGP-NEXT: v_ashrrev_i32_e32 v15, 31, v7
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v15
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v6, v[10:11]
+; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v15, vcc
+; CGP-NEXT: v_xor_b32_e32 v11, v5, v15
; CGP-NEXT: v_mul_lo_u32 v5, v14, v8
-; CGP-NEXT: v_mul_lo_u32 v7, v11, v9
-; CGP-NEXT: v_xor_b32_e32 v13, v6, v12
-; CGP-NEXT: v_mul_hi_u32 v6, v11, v8
+; CGP-NEXT: v_mul_lo_u32 v9, v6, v12
+; CGP-NEXT: v_xor_b32_e32 v13, v7, v15
+; CGP-NEXT: v_mul_hi_u32 v7, v6, v8
; CGP-NEXT: v_mul_hi_u32 v8, v14, v8
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v6, v14, v9
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; CGP-NEXT: v_mul_hi_u32 v7, v11, v9
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; CGP-NEXT: v_mul_lo_u32 v7, v14, v12
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5
+; CGP-NEXT: v_mul_hi_u32 v9, v6, v12
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT: v_mul_hi_u32 v9, v14, v12
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; CGP-NEXT: v_mul_hi_u32 v8, v14, v9
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7
; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5
-; CGP-NEXT: v_addc_u32_e32 v6, vcc, v14, v6, vcc
+; CGP-NEXT: v_addc_u32_e32 v6, vcc, v14, v7, vcc
; CGP-NEXT: v_mul_lo_u32 v7, v13, v5
-; CGP-NEXT: v_mul_lo_u32 v8, v10, v6
-; CGP-NEXT: v_mul_hi_u32 v9, v10, v5
+; CGP-NEXT: v_mul_lo_u32 v8, v11, v6
+; CGP-NEXT: v_mul_hi_u32 v9, v11, v5
; CGP-NEXT: v_mul_hi_u32 v5, v13, v5
-; CGP-NEXT: v_mul_hi_u32 v11, v13, v6
; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CGP-NEXT: v_mul_lo_u32 v9, v13, v6
; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; CGP-NEXT: v_mul_hi_u32 v8, v10, v6
+; CGP-NEXT: v_mul_hi_u32 v8, v11, v6
; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v5, v7
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, 0
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v7
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v8, v[6:7]
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v10, v5
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v9, v[6:7]
-; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v13, v6, vcc
-; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v13, v6
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3
-; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v5, v7
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v5
+; CGP-NEXT: v_mul_hi_u32 v8, v13, v6
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v12, 0
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v8, v7
+; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v14, v[6:7]
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v11, v5
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v12, v[7:8]
+; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v13, v9, vcc
+; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v13, v9
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
+; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4
; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3
-; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v9
-; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
+; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[4:5]
+; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12
+; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v14, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
-; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v10
-; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v11, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v5, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
-; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
-; CGP-NEXT: v_xor_b32_e32 v5, v12, v2
-; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v8
+; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
+; CGP-NEXT: v_xor_b32_e32 v5, v15, v2
+; CGP-NEXT: v_cndmask_b32_e32 v4, v14, v4, vcc
; CGP-NEXT: v_xor_b32_e32 v2, v3, v5
; CGP-NEXT: v_xor_b32_e32 v3, v4, v5
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
@@ -2504,15 +2481,15 @@ define i64 @v_sdiv_i64_24bit(i64 %num, i64 %den) {
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3
-; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0
; CGP-NEXT: v_rcp_f32_e32 v1, v1
; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v1
; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3
-; CGP-NEXT: v_mul_lo_u32 v1, v1, v4
-; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v1, 0
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v2
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0
+; CGP-NEXT: v_mul_lo_u32 v5, v1, v4
+; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v5, 0
+; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
; CGP-NEXT: v_mul_lo_u32 v0, v1, v3
; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v1
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v5, v0
@@ -2537,198 +2514,194 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4
; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1
-; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v1
-; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9
+; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v10, 0
+; GISEL-NEXT: v_sub_i32_e32 v12, vcc, 0, v1
+; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v10
; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_subb_u32_e64 v13, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3
-; GISEL-NEXT: v_trunc_f32_e32 v5, v4
-; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5
-; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v3
-; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5]
-; GISEL-NEXT: v_mul_hi_u32 v12, v7, v3
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3
-; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3
-; GISEL-NEXT: v_mul_lo_u32 v13, v7, v4
-; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; GISEL-NEXT: v_mul_hi_u32 v12, v7, v4
+; GISEL-NEXT: v_trunc_f32_e32 v4, v4
+; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v3
+; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v4
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v9, v[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, v[7:8]
+; GISEL-NEXT: v_mul_lo_u32 v5, v9, v3
+; GISEL-NEXT: v_mul_hi_u32 v7, v11, v3
+; GISEL-NEXT: v_mul_lo_u32 v8, v11, v4
+; GISEL-NEXT: v_mul_hi_u32 v3, v9, v3
+; GISEL-NEXT: v_mul_lo_u32 v14, v9, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v11, v4
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v3
-; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3
-; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0
-; GISEL-NEXT: v_mul_lo_u32 v8, v7, v4
-; GISEL-NEXT: v_mul_hi_u32 v0, v7, v3
-; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v3
+; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v9, v[4:5]
+; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v11, v3
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, v[7:8]
+; GISEL-NEXT: v_mul_lo_u32 v5, v9, v3
+; GISEL-NEXT: v_mul_hi_u32 v3, v9, v3
+; GISEL-NEXT: v_mul_lo_u32 v7, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v11, v4
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_mul_hi_u32 v8, v7, v4
+; GISEL-NEXT: v_mul_lo_u32 v5, v9, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0
+; GISEL-NEXT: v_mul_hi_u32 v7, v11, v4
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v9, v3, vcc
; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0
-; GISEL-NEXT: v_mul_lo_u32 v5, v10, v3
-; GISEL-NEXT: v_mul_hi_u32 v7, v10, v0
+; GISEL-NEXT: v_mul_lo_u32 v5, v12, v3
+; GISEL-NEXT: v_mul_hi_u32 v7, v12, v0
; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0
-; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v2
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; GISEL-NEXT: v_mul_lo_u32 v5, 0, v3
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3
+; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0
; GISEL-NEXT: v_mul_hi_u32 v4, 0, v3
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0
; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6
-; GISEL-NEXT: v_mov_b32_e32 v5, v8
-; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v4, v[5:6]
-; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v8
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v0, v[5:6]
-; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v10, v7
-; GISEL-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v2
-; GISEL-NEXT: v_trunc_f32_e32 v8, v6
-; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v8
-; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v2
-; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v3
-; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0
-; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v8
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5
-; GISEL-NEXT: v_mov_b32_e32 v2, v7
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v15, v[2:3]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v14, v12, v[7:8]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v8, -1, v2, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v2, v15, v6
-; GISEL-NEXT: v_mul_lo_u32 v10, v12, v7
-; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v5, vcc
-; GISEL-NEXT: v_mul_hi_u32 v5, v12, v6
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10
+; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v3
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v4, v[8:9]
+; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v10
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v0, v[5:6]
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v11
+; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v12, v7
+; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, v8, vcc
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
+; GISEL-NEXT: v_trunc_f32_e32 v6, v6
+; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6
+; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5
+; GISEL-NEXT: v_sub_i32_e64 v15, s[4:5], 0, v3
+; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v6
+; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v13, 0
+; GISEL-NEXT: v_sub_i32_e64 v17, s[4:5], 0, v8
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v14, v[6:7]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[7:8]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, -1, v6, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v6, v14, v5
+; GISEL-NEXT: v_mul_lo_u32 v7, v13, v9
+; GISEL-NEXT: v_mul_hi_u32 v10, v13, v5
+; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v17, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v10, v14, v9
+; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT: v_mul_hi_u32 v7, v13, v9
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v15, v7
-; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2
-; GISEL-NEXT: v_mul_hi_u32 v10, v12, v7
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7
+; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT: v_mul_hi_u32 v7, v15, v7
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v2
-; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v15, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v10, 0
-; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v9, v1
-; GISEL-NEXT: v_mov_b32_e32 v2, v6
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, v[2:3]
-; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v16, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v10, v[6:7]
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v0
-; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v4, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1
-; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5
-; GISEL-NEXT: v_mul_lo_u32 v9, v10, v6
-; GISEL-NEXT: v_mul_hi_u32 v14, v10, v5
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v5
+; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v14, v6, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v13, 0
+; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v11, v1
+; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v8, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v14, v[6:7]
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v0
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[7:8]
+; GISEL-NEXT: v_mul_lo_u32 v6, v14, v5
+; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v4, vcc
+; GISEL-NEXT: v_mul_lo_u32 v7, v13, v9
+; GISEL-NEXT: v_mul_hi_u32 v8, v13, v5
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v1
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v14, v12, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
-; GISEL-NEXT: v_mul_hi_u32 v9, v10, v6
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v14, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9
-; GISEL-NEXT: v_mul_hi_u32 v6, v12, v6
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v8, v14, v9
+; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT: v_mul_hi_u32 v7, v13, v9
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v12, v6, vcc
-; GISEL-NEXT: v_mul_lo_u32 v6, 0, v5
-; GISEL-NEXT: v_mul_lo_u32 v9, v11, v7
-; GISEL-NEXT: v_mul_hi_u32 v14, v11, v5
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v2
-; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v13, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
-; GISEL-NEXT: v_mul_lo_u32 v9, 0, v7
-; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14
-; GISEL-NEXT: v_mul_hi_u32 v14, v11, v7
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT: v_mul_hi_u32 v8, v14, v9
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v5, v6
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, 0
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5
+; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v14, v6, vcc
+; GISEL-NEXT: v_mul_lo_u32 v7, 0, v5
+; GISEL-NEXT: v_mul_lo_u32 v8, v2, v6
+; GISEL-NEXT: v_mul_hi_u32 v11, v2, v5
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v15
+; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v18, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT: v_mul_lo_u32 v8, 0, v6
+; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11
+; GISEL-NEXT: v_mul_hi_u32 v11, v2, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v5, v7
+; GISEL-NEXT: v_mul_hi_u32 v13, 0, v6
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v11, 0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
-; GISEL-NEXT: v_mul_hi_u32 v10, 0, v7
-; GISEL-NEXT: v_mov_b32_e32 v1, v6
-; GISEL-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v10, v[1:2]
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v12, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v9, v[6:7]
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v5
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v6
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v9, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v3, v13, v[6:7]
+; GISEL-NEXT: v_cndmask_b32_e32 v9, v18, v10, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], 0, v11, v[7:8]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v9
+; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v9, vcc
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v3
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
@@ -2736,8 +2709,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v9
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v11
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
@@ -2748,8 +2721,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_sdiv_v2i64_24bit:
@@ -2757,47 +2730,47 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v4
; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3
-; CGP-NEXT: v_and_b32_e32 v4, 0xffffff, v6
-; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v3
+; CGP-NEXT: v_and_b32_e32 v4, 0xffffff, v0
+; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v6
+; CGP-NEXT: v_and_b32_e32 v9, 0xffffff, v2
; CGP-NEXT: v_rcp_f32_e32 v1, v1
-; CGP-NEXT: v_and_b32_e32 v8, 0xffffff, v0
-; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; CGP-NEXT: v_cvt_u32_f32_e32 v5, v1
-; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4
-; CGP-NEXT: v_mul_lo_u32 v6, v6, v5
-; CGP-NEXT: v_rcp_f32_e32 v7, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0
-; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7
+; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v1
; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4
-; CGP-NEXT: v_mul_lo_u32 v5, v1, v3
-; CGP-NEXT: v_mul_lo_u32 v0, v0, v6
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3
+; CGP-NEXT: v_cvt_f32_u32_e32 v0, v5
+; CGP-NEXT: v_mul_lo_u32 v7, v1, v6
+; CGP-NEXT: v_rcp_f32_e32 v8, v0
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
+; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v8
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v1
+; CGP-NEXT: v_cvt_u32_f32_e32 v2, v0
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v6, 0
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v5
+; CGP-NEXT: v_mul_lo_u32 v6, v0, v2
+; CGP-NEXT: v_mul_lo_u32 v0, v1, v3
; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v1
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v4, v0
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v3
; CGP-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0
-; CGP-NEXT: v_and_b32_e32 v8, 0xffffff, v2
-; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v5, v3
-; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v6, v1
-; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v1, 0
-; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v6, 0
+; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v4, v3
+; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v2, v1
+; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, 0
+; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v7
-; CGP-NEXT: v_mul_lo_u32 v5, v2, v4
+; CGP-NEXT: v_mul_lo_u32 v4, v2, v5
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3
; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc
; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v5
-; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4
-; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
-; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v4
-; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4
-; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, v9, v4
+; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v5
+; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v5
+; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v5
+; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; CGP-NEXT: s_setpc_b64 s[30:31]
%num.mask = and <2 x i64> %num, <i64 16777215, i64 16777215>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index 1441591..9d6ffc9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -172,68 +172,68 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: s_subb_u32 s15, 0, s9
; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX8-NEXT: v_trunc_f32_e32 v2, v1
-; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
-; GFX8-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
+; GFX8-NEXT: v_trunc_f32_e32 v1, v1
+; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1
+; GFX8-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v1
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v6, 0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s15, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v6, 0
+; GFX8-NEXT: v_mov_b32_e32 v8, s11
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s15, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
; GFX8-NEXT: v_mul_lo_u32 v2, s11, v0
; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1
; GFX8-NEXT: v_mul_hi_u32 v4, s10, v0
; GFX8-NEXT: v_mul_hi_u32 v0, s11, v0
-; GFX8-NEXT: v_mul_hi_u32 v5, s11, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
@@ -246,36 +246,36 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v6, s11
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0
+; GFX8-NEXT: v_mul_hi_u32 v3, s11, v1
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v3, v2
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2]
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s10, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v5, s9
-; GFX8-NEXT: v_subb_u32_e64 v2, s[0:1], v6, v1, vcc
-; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s11, v1
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NEXT: v_subb_u32_e64 v2, s[0:1], v8, v4, vcc
+; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s11, v4
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v2
-; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[0:1]
-; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s8, v0
-; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4
-; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
+; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1]
+; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v0
+; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v3, vcc
+; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v6
+; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v7, s[0:1]
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7
-; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v5
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8
-; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v7
+; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v5
; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1]
; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9
; GFX8-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
@@ -283,20 +283,20 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v5, v0, v5, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v9, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v7, v10, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v0, v3, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1]
; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[12:13]
; GFX8-NEXT: v_xor_b32_e32 v0, s0, v4
-; GFX8-NEXT: v_xor_b32_e32 v1, s1, v3
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_xor_b32_e32 v1, s1, v6
+; GFX8-NEXT: v_mov_b32_e32 v4, s1
; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; GFX8-NEXT: v_xor_b32_e32 v3, s2, v5
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
+; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3
; GFX8-NEXT: v_xor_b32_e32 v4, s2, v2
; GFX8-NEXT: v_mov_b32_e32 v5, s2
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s2, v3
@@ -312,6 +312,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-LABEL: sdivrem_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s2, s17, 31
; GFX9-NEXT: s_ashr_i32 s4, s19, 31
@@ -332,67 +333,66 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: s_subb_u32 s11, 0, s7
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX9-NEXT: v_trunc_f32_e32 v2, v1
-; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
-; GFX9-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add_u32_e32 v2, v6, v2
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, s7
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
+; GFX9-NEXT: v_trunc_f32_e32 v1, v1
+; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1
+; GFX9-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v1
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add_u32_e32 v1, v3, v1
+; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v6, v5
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc
; GFX9-NEXT: v_mul_lo_u32 v2, s9, v0
; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1
; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0
; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0
-; GFX9-NEXT: v_mul_hi_u32 v6, s9, v1
+; GFX9-NEXT: v_mul_hi_u32 v5, s9, v1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
@@ -400,67 +400,67 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1
; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1
+; GFX9-NEXT: v_mov_b32_e32 v6, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v5, 0
+; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v0, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v7, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v6, s9
+; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s6, v8, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s8, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v5, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v6, v1, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v2
-; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s7, v7, v[2:3]
+; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v1, v4, vcc
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v1
+; GFX9-NEXT: v_sub_u32_e32 v2, s9, v4
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[0:1]
-; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s6, v0
-; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v1, vcc
-; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5
-; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v9
+; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v1
+; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1]
+; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v0
+; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v2, vcc
+; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v7
+; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v8, s[0:1]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v5
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v4
+; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v9
-; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v8
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v5
+; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v4
; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1]
; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10
-; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc
; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1]
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1]
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v10, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v11, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v0, v4, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1]
; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5]
-; GFX9-NEXT: v_xor_b32_e32 v0, s0, v5
-; GFX9-NEXT: v_xor_b32_e32 v1, s1, v3
+; GFX9-NEXT: v_xor_b32_e32 v0, s0, v3
+; GFX9-NEXT: v_xor_b32_e32 v1, s1, v7
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT: v_xor_b32_e32 v3, s2, v6
-; GFX9-NEXT: v_xor_b32_e32 v5, s2, v2
-; GFX9-NEXT: v_mov_b32_e32 v6, s2
+; GFX9-NEXT: v_xor_b32_e32 v3, s2, v4
+; GFX9-NEXT: v_xor_b32_e32 v4, s2, v2
+; GFX9-NEXT: v_mov_b32_e32 v5, s2
; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v3
-; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13]
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[14:15]
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v5, vcc
+; GFX9-NEXT: global_store_dwordx2 v9, v[0:1], s[12:13]
+; GFX9-NEXT: global_store_dwordx2 v9, v[2:3], s[14:15]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sdivrem_i64:
@@ -554,29 +554,29 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s8
; GFX10-NEXT: v_add_co_u32 v0, s8, v5, v0
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s8
+; GFX10-NEXT: v_mul_hi_u32 v5, s1, v1
; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2
; GFX10-NEXT: v_add_co_u32 v0, s8, v0, v3
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s8
-; GFX10-NEXT: v_add_co_u32 v5, s8, v0, v2
-; GFX10-NEXT: v_mul_hi_u32 v2, s1, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s8
-; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s8, s6, v5, 0
-; GFX10-NEXT: v_add3_u32 v3, v3, v6, v2
+; GFX10-NEXT: v_add_co_u32 v6, s8, v0, v2
+; GFX10-NEXT: v_add_nc_u32_e32 v2, v4, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s8
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s8, s6, v6, 0
+; GFX10-NEXT: v_add3_u32 v3, v2, v3, v5
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s6, v3, v[1:2]
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s7, v5, v[1:2]
-; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v5, 1
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s7, v6, v[1:2]
+; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v6, 1
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
-; GFX10-NEXT: v_sub_nc_u32_e32 v6, s1, v1
+; GFX10-NEXT: v_sub_nc_u32_e32 v5, s1, v1
; GFX10-NEXT: v_sub_co_ci_u32_e64 v1, s0, s1, v1, vcc_lo
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s7, v5, vcc_lo
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v0
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v0, s6
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v6, vcc_lo
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v5, vcc_lo
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v1
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s7, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v8
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0
@@ -590,16 +590,16 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11
; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v7, s0
; GFX10-NEXT: v_sub_co_u32 v10, s0, v8, s6
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v6, s0, 0, v6, s0
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v5, s0, 0, v5, s0
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v13, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v7
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v7, v8, v10, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v2, s0
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_xor_b32_e32 v2, s4, v2
; GFX10-NEXT: v_xor_b32_e32 v3, s5, v3
@@ -1308,71 +1308,71 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: s_subb_u32 s17, 0, s9
; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX8-NEXT: v_trunc_f32_e32 v2, v1
-; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
-; GFX8-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
+; GFX8-NEXT: v_trunc_f32_e32 v1, v1
+; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1
+; GFX8-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v1
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0
+; GFX8-NEXT: v_mov_b32_e32 v8, s11
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4
; GFX8-NEXT: s_xor_b64 s[16:17], s[4:5], s[6:7]
; GFX8-NEXT: s_ashr_i32 s6, s19, 31
; GFX8-NEXT: s_mov_b32 s7, s6
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
; GFX8-NEXT: v_mul_lo_u32 v2, s11, v0
; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1
; GFX8-NEXT: v_mul_hi_u32 v4, s10, v0
; GFX8-NEXT: v_mul_hi_u32 v0, s11, v0
-; GFX8-NEXT: v_mul_hi_u32 v5, s11, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
@@ -1385,207 +1385,206 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v6, s11
-; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s10, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v5, s9
-; GFX8-NEXT: s_ashr_i32 s10, s3, 31
-; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v1, vcc
-; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v1
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7
-; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0
+; GFX8-NEXT: v_mul_hi_u32 v3, s11, v1
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v3, v2
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2]
+; GFX8-NEXT: v_sub_u32_e32 v9, vcc, s10, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v4, vcc
+; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v4
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6
-; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s8, v7
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1]
-; GFX8-NEXT: v_subbrev_u32_e64 v9, s[0:1], 0, v0, vcc
-; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], 1, v4
-; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v9
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8
+; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v9
+; GFX8-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v0, vcc
+; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], 1, v6
+; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v7, s[0:1]
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v3
+; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v9
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4
+; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, s8, v3
+; GFX8-NEXT: s_ashr_i32 s8, s3, 31
; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1]
-; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v1
+; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v5
+; GFX8-NEXT: s_add_u32 s10, s18, s6
; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1]
-; GFX8-NEXT: s_add_u32 s0, s18, s6
-; GFX8-NEXT: s_addc_u32 s1, s19, s6
-; GFX8-NEXT: s_add_u32 s2, s2, s10
-; GFX8-NEXT: s_mov_b32 s11, s10
-; GFX8-NEXT: s_addc_u32 s3, s3, s10
-; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11]
-; GFX8-NEXT: v_cvt_f32_u32_e32 v14, s3
-; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc
-; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s2
-; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s8, v8
-; GFX8-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v0, vcc
-; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v14
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX8-NEXT: s_addc_u32 s11, s19, s6
+; GFX8-NEXT: s_add_u32 s0, s2, s8
+; GFX8-NEXT: s_mov_b32 s9, s8
+; GFX8-NEXT: s_addc_u32 s1, s3, s8
+; GFX8-NEXT: s_xor_b64 s[2:3], s[0:1], s[8:9]
+; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s3
+; GFX8-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v0, vcc
+; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1
+; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v1, v12, vcc
-; GFX8-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7]
-; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX8-NEXT: v_trunc_f32_e32 v11, v1
-; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v11
; GFX8-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v0
+; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX8-NEXT: s_xor_b64 s[10:11], s[10:11], s[6:7]
; GFX8-NEXT: s_sub_u32 s5, 0, s2
-; GFX8-NEXT: s_subb_u32 s20, 0, s3
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc
+; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
+; GFX8-NEXT: v_trunc_f32_e32 v1, v1
+; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1
+; GFX8-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v11, v0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v1
; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v12, 0
-; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1]
-; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v11
-; GFX8-NEXT: v_cndmask_b32_e64 v10, v3, v10, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v15, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s5, v5, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v3, v5, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v12, v[1:2]
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v16, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1]
-; GFX8-NEXT: v_mul_lo_u32 v8, v12, v1
-; GFX8-NEXT: v_mul_hi_u32 v2, v12, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v5, v0
-; GFX8-NEXT: v_xor_b32_e32 v9, s17, v10
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8
-; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[0:1]
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[18:19], s5, v11, 0
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v3, v14, vcc
+; GFX8-NEXT: s_subb_u32 s20, 0, s3
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v12, v[1:2]
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v5, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v11, v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[0:1]
+; GFX8-NEXT: v_mul_lo_u32 v1, v12, v0
+; GFX8-NEXT: v_mul_lo_u32 v2, v11, v4
+; GFX8-NEXT: v_mul_hi_u32 v3, v11, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v12, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[0:1]
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v3, v5, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2
-; GFX8-NEXT: v_mul_hi_u32 v8, v12, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v3, v12, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_mul_hi_u32 v2, v11, v4
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
-; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8
-; GFX8-NEXT: v_mul_hi_u32 v1, v5, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v12, v0
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v8, 0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; GFX8-NEXT: v_xor_b32_e32 v1, s16, v4
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v5, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v10, s17
-; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s16, v1
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v8, v[3:4]
-; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v9, v10, vcc
-; GFX8-NEXT: v_xor_b32_e32 v4, s4, v7
-; GFX8-NEXT: v_mul_lo_u32 v7, v5, v2
-; GFX8-NEXT: v_mul_lo_u32 v9, v8, v3
-; GFX8-NEXT: v_mul_hi_u32 v11, v8, v2
-; GFX8-NEXT: v_mul_hi_u32 v2, v5, v2
-; GFX8-NEXT: v_xor_b32_e32 v6, s4, v6
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9
-; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v11
+; GFX8-NEXT: v_mul_hi_u32 v3, v12, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v11, v0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v10, 0
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v12, v1, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v11, v[3:4]
+; GFX8-NEXT: v_xor_b32_e32 v6, s16, v6
+; GFX8-NEXT: v_xor_b32_e32 v1, s17, v7
+; GFX8-NEXT: v_mov_b32_e32 v7, s17
+; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s16, v6
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s20, v10, v[4:5]
+; GFX8-NEXT: v_mul_lo_u32 v4, v11, v2
+; GFX8-NEXT: v_xor_b32_e32 v3, s4, v9
+; GFX8-NEXT: v_mul_lo_u32 v7, v10, v6
+; GFX8-NEXT: v_mul_hi_u32 v9, v10, v2
+; GFX8-NEXT: v_mul_hi_u32 v2, v11, v2
+; GFX8-NEXT: v_xor_b32_e32 v5, s4, v8
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v11, v5, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7
-; GFX8-NEXT: v_mul_hi_u32 v9, v8, v3
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v11, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v9
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v9
+; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v9, v11, v6
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4
+; GFX8-NEXT: v_mul_hi_u32 v7, v10, v6
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2
; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, v11, v9
-; GFX8-NEXT: v_mul_hi_u32 v3, v5, v3
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
-; GFX8-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NEXT: v_mul_lo_u32 v7, s9, v2
-; GFX8-NEXT: v_mul_lo_u32 v8, s8, v3
-; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v10, vcc
-; GFX8-NEXT: v_mul_hi_u32 v6, s8, v2
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
+; GFX8-NEXT: v_mul_hi_u32 v6, v11, v6
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
+; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v10, v2
+; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v11, v4, vcc
+; GFX8-NEXT: v_mul_lo_u32 v7, s11, v2
+; GFX8-NEXT: v_mul_lo_u32 v9, s10, v6
+; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v3
+; GFX8-NEXT: v_mul_hi_u32 v3, s10, v2
+; GFX8-NEXT: v_mov_b32_e32 v8, s4
+; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v5, v8, vcc
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v7, s9, v3
-; GFX8-NEXT: v_mul_hi_u32 v2, s9, v2
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6
-; GFX8-NEXT: v_mul_hi_u32 v8, s8, v3
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v7, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v7, s11, v6
+; GFX8-NEXT: v_mul_hi_u32 v2, s11, v2
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v8, v3
+; GFX8-NEXT: v_mul_hi_u32 v8, s10, v6
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v8
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v2, v6
-; GFX8-NEXT: v_mul_hi_u32 v9, s9, v3
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v8, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
-; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v9, v[3:4]
-; GFX8-NEXT: v_mov_b32_e32 v10, s9
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v2
-; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v8, v[6:7]
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v2, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT: v_mul_hi_u32 v6, s11, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, v6, v7
+; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4]
+; GFX8-NEXT: v_mov_b32_e32 v12, s11
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s10, v2
+; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7]
; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: v_subb_u32_e64 v7, s[0:1], v10, v6, vcc
-; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], s9, v6
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7
-; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1]
+; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v12, v8, vcc
+; GFX8-NEXT: v_sub_u32_e64 v7, s[0:1], s11, v8
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6
+; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7
-; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[0:1]
-; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, s2, v2
-; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1]
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6
+; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1]
+; GFX8-NEXT: v_subrev_u32_e32 v9, vcc, s2, v2
+; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v7, vcc
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12
; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v9
; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12
; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1]
-; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v8
-; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc
-; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v9, s[0:1]
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v14
+; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v10
+; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc
+; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v11, s[0:1]
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 1, v14
; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v11
+; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v9
; GFX8-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v8, v9, v14, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc
+; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v13, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v7, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1]
-; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11]
-; GFX8-NEXT: v_xor_b32_e32 v2, s0, v6
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1]
+; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9]
+; GFX8-NEXT: v_xor_b32_e32 v2, s0, v7
; GFX8-NEXT: v_xor_b32_e32 v3, s1, v8
-; GFX8-NEXT: v_mov_b32_e32 v6, s1
+; GFX8-NEXT: v_mov_b32_e32 v7, s1
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v2
-; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc
-; GFX8-NEXT: v_xor_b32_e32 v6, s6, v9
-; GFX8-NEXT: v_xor_b32_e32 v7, s6, v7
-; GFX8-NEXT: v_mov_b32_e32 v8, s6
-; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v6
-; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v8, vcc
+; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
+; GFX8-NEXT: v_xor_b32_e32 v7, s6, v9
+; GFX8-NEXT: v_xor_b32_e32 v8, s6, v6
+; GFX8-NEXT: v_mov_b32_e32 v9, s6
+; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v7
+; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v8, v9, vcc
; GFX8-NEXT: v_mov_b32_e32 v8, s12
; GFX8-NEXT: v_mov_b32_e32 v9, s13
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
@@ -1619,69 +1618,70 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: s_subb_u32 s17, 0, s9
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX9-NEXT: v_trunc_f32_e32 v2, v1
-; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
-; GFX9-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add_u32_e32 v2, v6, v2
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
+; GFX9-NEXT: v_trunc_f32_e32 v1, v1
+; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1
+; GFX9-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v1
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add_u32_e32 v1, v3, v1
+; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
+; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0
+; GFX9-NEXT: v_mov_b32_e32 v8, s11
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4
; GFX9-NEXT: s_xor_b64 s[16:17], s[4:5], s[6:7]
; GFX9-NEXT: s_ashr_i32 s6, s19, 31
; GFX9-NEXT: s_mov_b32 s7, s6
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1
-; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v6, v5
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc
; GFX9-NEXT: v_mul_lo_u32 v2, s11, v0
; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1
; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0
; GFX9-NEXT: v_mul_hi_u32 v0, s11, v0
-; GFX9-NEXT: v_mul_hi_u32 v6, s11, v1
+; GFX9-NEXT: v_mul_hi_u32 v5, s11, v1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
@@ -1693,205 +1693,203 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v6, s11
-; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s10, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v4, s9
-; GFX9-NEXT: s_ashr_i32 s10, s3, 31
-; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc
-; GFX9-NEXT: v_sub_u32_e32 v0, s11, v1
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7
-; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc
+; GFX9-NEXT: v_add3_u32 v7, v3, v2, v5
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2]
+; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s10, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v8, v4, vcc
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8
+; GFX9-NEXT: v_sub_u32_e32 v0, s11, v4
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6
-; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s8, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v2, s[0:1]
-; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v0, vcc
-; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v5
-; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8
+; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s8, v9
+; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v0, vcc
+; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], 1, v6
+; GFX9-NEXT: v_addc_co_u32_e64 v10, s[0:1], 0, v7, s[0:1]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4
+; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v3
+; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10
-; GFX9-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[0:1]
-; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v2
-; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s18, s6
-; GFX9-NEXT: s_addc_u32 s1, s19, s6
-; GFX9-NEXT: s_add_u32 s2, s2, s10
-; GFX9-NEXT: s_mov_b32 s11, s10
-; GFX9-NEXT: s_addc_u32 s3, s3, s10
-; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4
+; GFX9-NEXT: v_subrev_co_u32_e32 v14, vcc, s8, v3
+; GFX9-NEXT: s_ashr_i32 s8, s3, 31
+; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e64 v12, s[0:1], 1, v5
+; GFX9-NEXT: s_add_u32 s10, s18, s6
+; GFX9-NEXT: v_addc_co_u32_e64 v13, s[0:1], 0, v10, s[0:1]
+; GFX9-NEXT: s_addc_u32 s11, s19, s6
+; GFX9-NEXT: s_add_u32 s0, s2, s8
+; GFX9-NEXT: s_mov_b32 s9, s8
+; GFX9-NEXT: s_addc_u32 s1, s3, s8
+; GFX9-NEXT: s_xor_b64 s[2:3], s[0:1], s[8:9]
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
-; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s2
-; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc
-; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1
-; GFX9-NEXT: v_add_f32_e32 v1, v1, v15
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s8, v9
; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc
-; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1
-; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX9-NEXT: v_trunc_f32_e32 v16, v1
-; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v16
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
; GFX9-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v0
-; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7]
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT: s_xor_b64 s[10:11], s[10:11], s[6:7]
; GFX9-NEXT: s_sub_u32 s5, 0, s2
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v17, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v12, v2, v13, vcc
-; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v16
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc
+; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
+; GFX9-NEXT: v_trunc_f32_e32 v1, v1
+; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1
+; GFX9-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v11, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[18:19], s5, v11, 0
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v3, v14, vcc
; GFX9-NEXT: s_subb_u32 s20, 0, s3
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v13, v[1:2]
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, v11, s[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v17, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v13, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v15, vcc
-; GFX9-NEXT: v_mul_lo_u32 v3, v17, v1
-; GFX9-NEXT: v_mul_hi_u32 v10, v17, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v13, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1]
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v10
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v12, v[1:2]
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v5, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v11, v[2:3]
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[0:1]
+; GFX9-NEXT: v_mul_lo_u32 v1, v12, v0
+; GFX9-NEXT: v_mul_lo_u32 v2, v11, v4
+; GFX9-NEXT: v_mul_hi_u32 v3, v11, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v12, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v10, v13, v1
-; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT: v_mul_hi_u32 v3, v17, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v13, v1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v3, v12, v4
+; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT: v_mul_hi_u32 v2, v11, v4
+; GFX9-NEXT: v_xor_b32_e32 v6, s16, v6
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v3, v10, v3
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v17, v0
-; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v10, 0
-; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v13, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v0, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v4, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v11, v[0:1]
-; GFX9-NEXT: v_xor_b32_e32 v5, s16, v5
-; GFX9-NEXT: v_xor_b32_e32 v8, s17, v8
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v10, v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v9, s17
-; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v5
-; GFX9-NEXT: v_xor_b32_e32 v4, s4, v7
-; GFX9-NEXT: v_mul_lo_u32 v5, v11, v2
-; GFX9-NEXT: v_mul_lo_u32 v7, v10, v3
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v9, vcc
-; GFX9-NEXT: v_mul_hi_u32 v8, v10, v2
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v8, v11, v3
+; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT: v_mul_hi_u32 v3, v12, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v0
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v10, 0
+; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v12, v1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v11, v[3:4]
+; GFX9-NEXT: v_xor_b32_e32 v1, s17, v7
+; GFX9-NEXT: v_mov_b32_e32 v7, s17
+; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v6
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s20, v10, v[4:5]
+; GFX9-NEXT: v_mul_lo_u32 v4, v11, v2
+; GFX9-NEXT: v_xor_b32_e32 v3, s4, v9
+; GFX9-NEXT: v_mul_lo_u32 v7, v10, v6
+; GFX9-NEXT: v_mul_hi_u32 v9, v10, v2
; GFX9-NEXT: v_mul_hi_u32 v2, v11, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
-; GFX9-NEXT: v_mul_hi_u32 v7, v10, v3
-; GFX9-NEXT: v_mul_hi_u32 v3, v11, v3
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT: v_add_u32_e32 v7, v8, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v3, v7, v5, v3
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc
-; GFX9-NEXT: v_mul_lo_u32 v5, s9, v2
-; GFX9-NEXT: v_mul_lo_u32 v7, s8, v3
-; GFX9-NEXT: v_mul_hi_u32 v9, s8, v2
-; GFX9-NEXT: v_mul_hi_u32 v2, s9, v2
-; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7
+; GFX9-NEXT: v_xor_b32_e32 v5, s4, v8
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7
; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v9, s9, v3
-; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
-; GFX9-NEXT: v_mul_hi_u32 v7, s8, v3
-; GFX9-NEXT: v_xor_b32_e32 v6, s4, v6
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9
+; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v9, v11, v6
+; GFX9-NEXT: v_add_u32_e32 v4, v7, v4
+; GFX9-NEXT: v_mul_hi_u32 v7, v10, v6
+; GFX9-NEXT: v_mul_hi_u32 v6, v11, v6
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7
; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v5
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
+; GFX9-NEXT: v_add_u32_e32 v7, v9, v7
+; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v4, v7, v4, v6
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v11, v4, vcc
+; GFX9-NEXT: v_mul_lo_u32 v7, s11, v2
+; GFX9-NEXT: v_mul_lo_u32 v9, s10, v6
+; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v3
+; GFX9-NEXT: v_mul_hi_u32 v3, s10, v2
; GFX9-NEXT: v_mov_b32_e32 v8, s4
-; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v4
-; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v8, vcc
-; GFX9-NEXT: v_add_u32_e32 v6, v9, v7
-; GFX9-NEXT: v_add3_u32 v8, v6, v11, v12
-; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v8, v[3:4]
-; GFX9-NEXT: v_mov_b32_e32 v9, s9
-; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v2
-; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v10, v[6:7]
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v8, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9
+; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v7, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v7, s11, v6
+; GFX9-NEXT: v_mul_hi_u32 v2, s11, v2
+; GFX9-NEXT: v_add_u32_e32 v3, v8, v3
+; GFX9-NEXT: v_mul_hi_u32 v8, s10, v6
+; GFX9-NEXT: v_mul_hi_u32 v6, s11, v6
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v8
+; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v3
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0
+; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX9-NEXT: v_add_u32_e32 v7, v7, v8
+; GFX9-NEXT: v_add3_u32 v11, v7, v9, v6
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4]
+; GFX9-NEXT: v_mov_b32_e32 v12, s11
+; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s10, v2
+; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7]
; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: v_subb_co_u32_e64 v7, s[0:1], v9, v6, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7
-; GFX9-NEXT: v_sub_u32_e32 v6, s9, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1]
+; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v12, v8, vcc
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6
+; GFX9-NEXT: v_sub_u32_e32 v7, s11, v8
+; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7
-; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[0:1]
-; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s2, v2
-; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6
+; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1]
+; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s2, v2
+; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v7, vcc
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v9
; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12
; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1]
; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v10
-; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v6, v3, vcc
-; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v8, s[0:1]
-; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 1, v14
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc
+; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v11, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v14
; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v15, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc
-; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v11
+; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v9
; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v15, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v6, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v14, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v7, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1]
-; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11]
-; GFX9-NEXT: v_xor_b32_e32 v2, s0, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1]
+; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9]
+; GFX9-NEXT: v_xor_b32_e32 v2, s0, v7
; GFX9-NEXT: v_xor_b32_e32 v3, s1, v8
-; GFX9-NEXT: v_mov_b32_e32 v6, s1
+; GFX9-NEXT: v_mov_b32_e32 v7, s1
; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc
-; GFX9-NEXT: v_xor_b32_e32 v6, s6, v9
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc
+; GFX9-NEXT: v_xor_b32_e32 v7, s6, v9
; GFX9-NEXT: v_mov_b32_e32 v13, 0
-; GFX9-NEXT: v_xor_b32_e32 v7, s6, v7
-; GFX9-NEXT: v_mov_b32_e32 v8, s6
-; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v6
-; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v8, vcc
+; GFX9-NEXT: v_xor_b32_e32 v8, s6, v6
+; GFX9-NEXT: v_mov_b32_e32 v9, s6
+; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v7
+; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v8, v9, vcc
; GFX9-NEXT: global_store_dwordx4 v13, v[0:3], s[12:13]
; GFX9-NEXT: global_store_dwordx4 v13, v[4:7], s[14:15]
; GFX9-NEXT: s_endpgm
@@ -1917,21 +1915,21 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX10-NEXT: s_subb_u32 s20, 0, s7
; GFX10-NEXT: s_xor_b64 s[16:17], s[4:5], s[8:9]
; GFX10-NEXT: s_ashr_i32 s8, s19, 31
+; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX10-NEXT: s_ashr_i32 s10, s3, 31
+; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1
; GFX10-NEXT: s_add_u32 s18, s18, s8
; GFX10-NEXT: s_addc_u32 s19, s19, s8
-; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6
-; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1
; GFX10-NEXT: s_add_u32 s2, s2, s10
; GFX10-NEXT: s_mov_b32 s11, s10
; GFX10-NEXT: s_addc_u32 s3, s3, s10
-; GFX10-NEXT: s_mov_b32 s9, s8
-; GFX10-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11]
; GFX10-NEXT: v_add_f32_e32 v0, v1, v0
+; GFX10-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11]
+; GFX10-NEXT: s_mov_b32 s9, s8
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3
; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s2
-; GFX10-NEXT: s_xor_b64 s[18:19], s[18:19], s[8:9]
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX10-NEXT: s_xor_b64 s[18:19], s[18:19], s[8:9]
; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1
; GFX10-NEXT: v_add_f32_e32 v1, v1, v2
; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1940,256 +1938,253 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX10-NEXT: v_trunc_f32_e32 v2, v2
; GFX10-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v1
; GFX10-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
+; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v2
; GFX10-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3
-; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v2
; GFX10-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX10-NEXT: v_trunc_f32_e32 v6, v4
-; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v0
-; GFX10-NEXT: v_mul_f32_e32 v4, 0xcf800000, v6
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s21, v7, 0
+; GFX10-NEXT: v_trunc_f32_e32 v5, v4
+; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX10-NEXT: v_mul_f32_e32 v4, 0xcf800000, v5
+; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v5
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s21, v6, 0
; GFX10-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX10-NEXT: s_sub_u32 s5, 0, s2
; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v3
-; GFX10-NEXT: v_mul_hi_u32 v10, v9, v0
-; GFX10-NEXT: v_mad_u64_u32 v[2:3], s22, s5, v8, 0
-; GFX10-NEXT: v_mad_u64_u32 v[4:5], s22, s21, v9, v[1:2]
-; GFX10-NEXT: v_cvt_u32_f32_e32 v5, v6
-; GFX10-NEXT: v_mov_b32_e32 v1, v3
-; GFX10-NEXT: v_mul_hi_u32 v6, v7, v0
+; GFX10-NEXT: v_mad_u64_u32 v[3:4], s5, s21, v7, v[1:2]
+; GFX10-NEXT: s_sub_u32 s5, 0, s2
+; GFX10-NEXT: v_mul_lo_u32 v10, v7, v0
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s23, s5, v8, 0
; GFX10-NEXT: s_subb_u32 s22, 0, s3
-; GFX10-NEXT: v_mul_hi_u32 v12, v8, v2
-; GFX10-NEXT: v_mul_lo_u32 v11, v5, v2
-; GFX10-NEXT: v_mad_u64_u32 v[3:4], s23, s20, v7, v[4:5]
-; GFX10-NEXT: v_mul_lo_u32 v4, v9, v0
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s5, v5, v[1:2]
-; GFX10-NEXT: v_mul_hi_u32 v2, v5, v2
-; GFX10-NEXT: v_mul_lo_u32 v13, v7, v3
-; GFX10-NEXT: v_mul_lo_u32 v14, v9, v3
-; GFX10-NEXT: v_mul_hi_u32 v15, v7, v3
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s22, v8, v[0:1]
-; GFX10-NEXT: v_mul_hi_u32 v1, v9, v3
-; GFX10-NEXT: v_add_co_u32 v3, s23, v4, v13
+; GFX10-NEXT: v_mul_hi_u32 v12, v7, v0
+; GFX10-NEXT: v_mad_u64_u32 v[3:4], s23, s20, v6, v[3:4]
+; GFX10-NEXT: v_mul_hi_u32 v11, v6, v0
+; GFX10-NEXT: v_mul_hi_u32 v14, v9, v1
+; GFX10-NEXT: v_mad_u64_u32 v[4:5], s23, s5, v9, v[2:3]
+; GFX10-NEXT: v_mul_hi_u32 v5, v8, v1
+; GFX10-NEXT: v_mul_lo_u32 v13, v6, v3
+; GFX10-NEXT: v_mul_lo_u32 v15, v7, v3
+; GFX10-NEXT: v_mul_lo_u32 v2, v9, v1
+; GFX10-NEXT: v_mul_hi_u32 v16, v6, v3
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s22, v8, v[4:5]
+; GFX10-NEXT: v_mul_hi_u32 v1, v7, v3
+; GFX10-NEXT: v_add_co_u32 v3, s23, v10, v13
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s23
-; GFX10-NEXT: v_add_co_u32 v10, s23, v14, v10
-; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s23
-; GFX10-NEXT: v_mul_lo_u32 v14, v8, v0
-; GFX10-NEXT: v_add_co_u32 v3, s23, v3, v6
+; GFX10-NEXT: v_add_co_u32 v10, s23, v15, v12
+; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s23
+; GFX10-NEXT: v_mul_lo_u32 v13, v8, v0
+; GFX10-NEXT: v_add_co_u32 v3, s23, v3, v11
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s23
-; GFX10-NEXT: v_add_co_u32 v6, s23, v10, v15
-; GFX10-NEXT: v_mul_lo_u32 v15, v5, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s23
+; GFX10-NEXT: v_mul_lo_u32 v15, v9, v0
+; GFX10-NEXT: v_add_co_u32 v10, s23, v10, v16
+; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s23
; GFX10-NEXT: v_mul_hi_u32 v16, v8, v0
-; GFX10-NEXT: v_mul_hi_u32 v17, v5, v0
+; GFX10-NEXT: v_mul_hi_u32 v17, v9, v0
; GFX10-NEXT: v_add_nc_u32_e32 v0, v4, v3
-; GFX10-NEXT: v_add_co_u32 v4, s23, v11, v14
-; GFX10-NEXT: v_add_nc_u32_e32 v3, v13, v10
-; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s23
-; GFX10-NEXT: v_add_co_u32 v2, s23, v15, v2
-; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s23
-; GFX10-NEXT: v_add_co_u32 v0, s23, v6, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s23
-; GFX10-NEXT: v_add_co_u32 v4, s23, v4, v12
+; GFX10-NEXT: v_add_co_u32 v2, s23, v2, v13
+; GFX10-NEXT: v_add_nc_u32_e32 v3, v12, v11
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s23
-; GFX10-NEXT: v_add_co_u32 v2, s23, v2, v16
-; GFX10-NEXT: v_add3_u32 v1, v3, v6, v1
-; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v7, v0
-; GFX10-NEXT: v_add_nc_u32_e32 v3, v10, v4
+; GFX10-NEXT: v_add_co_u32 v11, s23, v15, v14
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s23
-; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v9, v1, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v0, s23, v10, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s23
+; GFX10-NEXT: v_add_co_u32 v2, s23, v2, v5
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s23
+; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v6, v0
+; GFX10-NEXT: v_add3_u32 v1, v3, v10, v1
+; GFX10-NEXT: v_add_co_u32 v5, s23, v11, v16
+; GFX10-NEXT: v_add_nc_u32_e32 v2, v4, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s23
+; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v1, vcc_lo
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s21, v6, 0
-; GFX10-NEXT: v_add_co_u32 v2, s23, v2, v3
-; GFX10-NEXT: v_add_nc_u32_e32 v4, v11, v12
-; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s23
-; GFX10-NEXT: v_mov_b32_e32 v10, 0
+; GFX10-NEXT: v_add_co_u32 v2, s23, v5, v2
+; GFX10-NEXT: v_add_nc_u32_e32 v3, v12, v11
+; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s23
; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v8, v2
-; GFX10-NEXT: v_mul_hi_u32 v11, v7, v0
-; GFX10-NEXT: v_add3_u32 v3, v4, v3, v17
-; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v3, vcc_lo
-; GFX10-NEXT: v_mad_u64_u32 v[2:3], s23, s5, v8, 0
-; GFX10-NEXT: v_mad_u64_u32 v[4:5], s21, s21, v7, v[1:2]
-; GFX10-NEXT: v_mov_b32_e32 v1, v3
-; GFX10-NEXT: v_mul_lo_u32 v12, v9, v2
-; GFX10-NEXT: v_mul_hi_u32 v13, v8, v2
-; GFX10-NEXT: v_mad_u64_u32 v[3:4], s20, s20, v6, v[4:5]
-; GFX10-NEXT: v_mul_lo_u32 v4, v7, v0
-; GFX10-NEXT: v_mul_hi_u32 v5, v6, v0
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s5, v9, v[1:2]
-; GFX10-NEXT: v_mul_hi_u32 v2, v9, v2
-; GFX10-NEXT: v_mul_lo_u32 v14, v6, v3
+; GFX10-NEXT: v_mul_lo_u32 v10, v7, v0
+; GFX10-NEXT: v_add3_u32 v5, v3, v4, v17
+; GFX10-NEXT: v_mad_u64_u32 v[3:4], s21, s21, v7, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s21, s5, v8, 0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v5, vcc_lo
+; GFX10-NEXT: v_mul_hi_u32 v12, v7, v0
+; GFX10-NEXT: v_mul_hi_u32 v11, v6, v0
+; GFX10-NEXT: v_mad_u64_u32 v[3:4], s20, s20, v6, v[3:4]
+; GFX10-NEXT: v_mul_hi_u32 v14, v9, v1
+; GFX10-NEXT: v_mad_u64_u32 v[4:5], s5, s5, v9, v[2:3]
+; GFX10-NEXT: v_mul_hi_u32 v5, v8, v1
+; GFX10-NEXT: v_mul_lo_u32 v13, v6, v3
; GFX10-NEXT: v_mul_lo_u32 v15, v7, v3
+; GFX10-NEXT: v_mul_lo_u32 v2, v9, v1
; GFX10-NEXT: v_mul_hi_u32 v16, v6, v3
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s22, v8, v[0:1]
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s22, v8, v[4:5]
; GFX10-NEXT: v_mul_hi_u32 v1, v7, v3
-; GFX10-NEXT: v_add_co_u32 v3, s5, v4, v14
+; GFX10-NEXT: v_add_co_u32 v3, s5, v10, v13
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5
-; GFX10-NEXT: v_add_co_u32 v11, s5, v15, v11
-; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s5
-; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v5
-; GFX10-NEXT: v_mul_lo_u32 v15, v8, v0
+; GFX10-NEXT: v_add_co_u32 v10, s5, v15, v12
+; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5
+; GFX10-NEXT: v_mul_lo_u32 v13, v8, v0
+; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v11
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s5
-; GFX10-NEXT: v_add_co_u32 v5, s5, v11, v16
-; GFX10-NEXT: v_mul_lo_u32 v16, v9, v0
+; GFX10-NEXT: v_mul_lo_u32 v15, v9, v0
+; GFX10-NEXT: v_add_co_u32 v10, s5, v10, v16
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5
; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3
-; GFX10-NEXT: v_mul_hi_u32 v17, v8, v0
-; GFX10-NEXT: v_mul_hi_u32 v0, v9, v0
-; GFX10-NEXT: v_add_nc_u32_e32 v4, v14, v11
-; GFX10-NEXT: v_add_co_u32 v11, s5, v12, v15
-; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5
-; GFX10-NEXT: v_add_co_u32 v2, s5, v16, v2
-; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s5
-; GFX10-NEXT: v_add_co_u32 v3, s5, v5, v3
-; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5
-; GFX10-NEXT: v_add_co_u32 v11, s5, v11, v13
-; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5
-; GFX10-NEXT: v_add_co_u32 v2, s5, v2, v17
-; GFX10-NEXT: v_add3_u32 v1, v4, v5, v1
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v6, v3
+; GFX10-NEXT: v_add_co_u32 v2, s5, v2, v13
+; GFX10-NEXT: v_mul_hi_u32 v16, v8, v0
; GFX10-NEXT: v_add_nc_u32_e32 v4, v12, v11
+; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5
+; GFX10-NEXT: v_add_co_u32 v12, s5, v15, v14
; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s5
+; GFX10-NEXT: v_add_co_u32 v3, s5, v10, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s5
+; GFX10-NEXT: v_add_co_u32 v2, s5, v2, v5
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v6, v3
+; GFX10-NEXT: v_add3_u32 v1, v4, v10, v1
+; GFX10-NEXT: v_add_co_u32 v5, s5, v12, v16
+; GFX10-NEXT: v_add_nc_u32_e32 v2, v11, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v1, vcc_lo
+; GFX10-NEXT: v_mul_hi_u32 v0, v9, v0
; GFX10-NEXT: v_mul_lo_u32 v6, s1, v3
-; GFX10-NEXT: v_add_co_u32 v2, s5, v2, v4
-; GFX10-NEXT: v_add_nc_u32_e32 v5, v14, v13
-; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5
-; GFX10-NEXT: v_mul_lo_u32 v11, s0, v1
+; GFX10-NEXT: v_add_co_u32 v2, s5, v5, v2
+; GFX10-NEXT: v_mul_lo_u32 v10, s0, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v4, v13, v12
+; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5
; GFX10-NEXT: v_mul_hi_u32 v7, s0, v3
; GFX10-NEXT: v_mul_hi_u32 v3, s1, v3
-; GFX10-NEXT: v_mul_lo_u32 v12, s1, v1
-; GFX10-NEXT: v_add3_u32 v0, v5, v4, v0
+; GFX10-NEXT: v_mul_lo_u32 v11, s1, v1
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v8, v2
+; GFX10-NEXT: v_add3_u32 v0, v4, v5, v0
; GFX10-NEXT: v_mul_hi_u32 v4, s0, v1
; GFX10-NEXT: v_mul_hi_u32 v5, s1, v1
-; GFX10-NEXT: v_add_co_u32 v1, s5, v6, v11
-; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v9, v0, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v1, s5, v6, v10
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s5
-; GFX10-NEXT: v_add_co_u32 v3, s5, v12, v3
-; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5
+; GFX10-NEXT: v_add_co_u32 v3, s5, v11, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v1, s5, v1, v7
-; GFX10-NEXT: v_mul_lo_u32 v0, s19, v2
-; GFX10-NEXT: v_mul_lo_u32 v12, s18, v8
+; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v9, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5
+; GFX10-NEXT: v_mul_lo_u32 v0, s19, v2
; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v4
+; GFX10-NEXT: v_mul_lo_u32 v7, s18, v8
+; GFX10-NEXT: v_add_nc_u32_e32 v1, v6, v1
; GFX10-NEXT: v_mul_hi_u32 v9, s18, v2
-; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5
; GFX10-NEXT: v_mul_hi_u32 v2, s19, v2
-; GFX10-NEXT: v_mul_lo_u32 v7, s19, v8
-; GFX10-NEXT: v_add_nc_u32_e32 v1, v6, v1
-; GFX10-NEXT: v_add_co_u32 v6, s5, v0, v12
-; GFX10-NEXT: v_mul_hi_u32 v13, s18, v8
-; GFX10-NEXT: v_add_nc_u32_e32 v4, v11, v4
-; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5
-; GFX10-NEXT: v_add_co_u32 v12, s5, v3, v1
-; GFX10-NEXT: v_add_co_u32 v2, s20, v7, v2
-; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s5
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s6, v12, 0
-; GFX10-NEXT: v_add_co_u32 v6, s5, v6, v9
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s5
-; GFX10-NEXT: v_add_co_u32 v9, s5, v2, v13
-; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s20
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5
-; GFX10-NEXT: v_add3_u32 v4, v4, v7, v5
-; GFX10-NEXT: v_add_nc_u32_e32 v6, v11, v6
-; GFX10-NEXT: v_mul_hi_u32 v5, s19, v8
-; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v12, 1
-; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5
+; GFX10-NEXT: v_mul_lo_u32 v6, s19, v8
+; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v1
+; GFX10-NEXT: v_add_co_u32 v7, s20, v0, v7
+; GFX10-NEXT: v_add_nc_u32_e32 v4, v10, v4
+; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s6, v3, 0
+; GFX10-NEXT: v_mul_hi_u32 v11, s18, v8
+; GFX10-NEXT: v_add_co_u32 v6, s5, v6, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s5
+; GFX10-NEXT: v_add3_u32 v4, v4, v12, v5
+; GFX10-NEXT: v_add_co_u32 v2, s5, v7, v9
+; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s20
+; GFX10-NEXT: v_mul_hi_u32 v7, s19, v8
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s5, s6, v4, v[1:2]
-; GFX10-NEXT: v_add_co_u32 v6, s5, v9, v6
-; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5
-; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v4, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v7, 1
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s5, s7, v12, v[1:2]
-; GFX10-NEXT: v_add3_u32 v5, v3, v9, v5
-; GFX10-NEXT: v_mad_u64_u32 v[2:3], s5, s2, v6, 0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v8, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, s0, v0
-; GFX10-NEXT: v_sub_nc_u32_e32 v9, s1, v1
-; GFX10-NEXT: v_sub_co_ci_u32_e64 v15, s0, s1, v1, vcc_lo
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v9, vcc_lo, s7, v9, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v3, vcc_lo, v14, s6
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v16, s0, 0, v9, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v14
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v9, vcc_lo, s7, v9, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v6, s5, v6, v11
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5
+; GFX10-NEXT: v_add_nc_u32_e32 v5, v10, v5
+; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v3, 1
+; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v4, vcc_lo
+; GFX10-NEXT: v_add_nc_u32_e32 v8, v13, v2
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s5, s7, v3, v[1:2]
+; GFX10-NEXT: v_add_co_u32 v5, s5, v6, v5
+; GFX10-NEXT: v_sub_co_u32 v12, vcc_lo, s0, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5
+; GFX10-NEXT: v_mov_b32_e32 v9, 0
+; GFX10-NEXT: v_sub_nc_u32_e32 v6, s1, v1
+; GFX10-NEXT: v_sub_co_ci_u32_e64 v13, s0, s1, v1, vcc_lo
+; GFX10-NEXT: v_add3_u32 v7, v8, v2, v7
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v12
+; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc_lo
+; GFX10-NEXT: v_sub_co_u32 v15, vcc_lo, v12, s6
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v16, s0, 0, v6, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v13
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, -1, s0
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v3
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v15
; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, -1, s0
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v16
; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, -1, s0
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v15
-; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, -1, s0
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v5, v[0:1]
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v5, 0
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v13
+; GFX10-NEXT: v_cndmask_b32_e64 v14, v17, v14, s0
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v16
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v19, v18, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v15
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v17, v20, v17, s0
-; GFX10-NEXT: v_sub_co_u32 v1, s0, v3, s6
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v9, s0
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s3, v6, v[0:1]
-; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v17
-; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v13, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v12, v7, s0
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v16, v9, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, s18, v2
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v8, s0
-; GFX10-NEXT: v_sub_co_ci_u32_e64 v8, s1, s19, v0, vcc_lo
-; GFX10-NEXT: v_sub_nc_u32_e32 v0, s19, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v14, v3, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v8
-; GFX10-NEXT: v_xor_b32_e32 v1, s16, v1
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v0, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2
-; GFX10-NEXT: v_xor_b32_e32 v4, s17, v4
-; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s0
-; GFX10-NEXT: v_xor_b32_e32 v3, s4, v3
-; GFX10-NEXT: v_xor_b32_e32 v7, s4, v7
+; GFX10-NEXT: v_cndmask_b32_e64 v17, v19, v18, s0
+; GFX10-NEXT: v_add_co_u32 v18, s0, v10, 1
+; GFX10-NEXT: v_add_co_ci_u32_e64 v19, s0, 0, v11, s0
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s2, v7, v[1:2]
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v17
+; GFX10-NEXT: v_sub_co_u32 v2, s0, v15, s6
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v6, s0, 0, v6, s0
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v18, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v11, v19, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v15, v2, vcc_lo
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s1, s3, v5, v[1:2]
+; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v14
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, v8, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, v10, s0
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v16, v6, vcc_lo
+; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, s18, v0
+; GFX10-NEXT: v_sub_co_ci_u32_e64 v10, s1, s19, v1, vcc_lo
+; GFX10-NEXT: v_sub_nc_u32_e32 v1, s19, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v12, v11, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v13, v4, s0
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v10
+; GFX10-NEXT: v_xor_b32_e32 v0, s16, v2
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v1, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v8
+; GFX10-NEXT: v_xor_b32_e32 v2, s17, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, -1, s0
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, v2, s2
+; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, v8, s2
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v14, s0, 0, v11, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v0, s0, v1, s16
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s17, v4, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v8
+; GFX10-NEXT: v_sub_co_u32 v0, s0, v0, s16
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s17, v2, s0
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v10
+; GFX10-NEXT: v_xor_b32_e32 v2, s4, v6
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v11, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v9, v12, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v12, s0
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v14
-; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, -1, s0
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v13
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0
-; GFX10-NEXT: v_add_co_u32 v15, s0, v6, 1
-; GFX10-NEXT: v_add_co_ci_u32_e64 v16, s0, 0, v5, s0
+; GFX10-NEXT: v_add_co_u32 v15, s0, v5, 1
+; GFX10-NEXT: v_add_co_ci_u32_e64 v16, s0, 0, v7, s0
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v14
-; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v12, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v12, s0
; GFX10-NEXT: v_add_co_u32 v12, s0, v15, 1
; GFX10-NEXT: v_add_co_ci_u32_e64 v17, s0, 0, v16, s0
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9
-; GFX10-NEXT: v_sub_co_u32 v9, s0, v13, s2
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX10-NEXT: v_sub_co_u32 v6, s0, v13, s2
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v11, s0, 0, v11, s0
; GFX10-NEXT: v_cndmask_b32_e32 v12, v15, v12, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v4
+; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v3
; GFX10-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v13, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v9, v14, v11, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v12, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v11, v5, v15, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v4, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v9, s0
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v14, v11, vcc_lo
+; GFX10-NEXT: v_xor_b32_e32 v11, s4, v4
+; GFX10-NEXT: v_cndmask_b32_e64 v12, v5, v12, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v15, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v8, v3, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v6, s0
; GFX10-NEXT: s_xor_b64 s[0:1], s[8:9], s[10:11]
-; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v3, s4
-; GFX10-NEXT: v_xor_b32_e32 v3, s0, v6
-; GFX10-NEXT: v_xor_b32_e32 v6, s1, v11
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s4, v7, vcc_lo
-; GFX10-NEXT: v_xor_b32_e32 v7, s8, v2
-; GFX10-NEXT: v_xor_b32_e32 v8, s8, v8
-; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v3, s0
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v6, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v7, s8
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s8, v8, vcc_lo
-; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[12:13]
-; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[14:15]
+; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v2, s4
+; GFX10-NEXT: v_xor_b32_e32 v2, s0, v12
+; GFX10-NEXT: v_xor_b32_e32 v7, s1, v7
+; GFX10-NEXT: v_xor_b32_e32 v8, s8, v3
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s4, v11, vcc_lo
+; GFX10-NEXT: v_xor_b32_e32 v10, s8, v6
+; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v2, s0
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v7, vcc_lo
+; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v8, s8
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s8, v10, vcc_lo
+; GFX10-NEXT: global_store_dwordx4 v9, v[0:3], s[12:13]
+; GFX10-NEXT: global_store_dwordx4 v9, v[4:7], s[14:15]
; GFX10-NEXT: s_endpgm
%div = sdiv <2 x i64> %x, %y
store <2 x i64> %div, ptr addrspace(1) %out0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll
deleted file mode 100644
index 9f4a6f2..0000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll
+++ /dev/null
@@ -1,522 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
-; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
-
-; Test gfx9+ s_shl[1-4]_add_u32 pattern matching
-
-define amdgpu_ps i32 @s_shl1_add_u32(i32 inreg %src0, i32 inreg %src1) {
-; GFX9-LABEL: s_shl1_add_u32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshl1_add_u32 s0, s0, s1
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_shl1_add_u32:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshl_b32 s0, s0, 1
-; GFX8-NEXT: s_add_i32 s0, s0, s1
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_shl1_add_u32:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshl1_add_u32 s0, s0, s1
-; GFX10-NEXT: ; return to shader part epilog
- %shl = shl i32 %src0, 1
- %add = add i32 %shl, %src1
- ret i32 %add
-}
-
-define amdgpu_ps i32 @s_shl2_add_u32(i32 inreg %src0, i32 inreg %src1) {
-; GFX9-LABEL: s_shl2_add_u32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshl2_add_u32 s0, s0, s1
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_shl2_add_u32:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshl_b32 s0, s0, 2
-; GFX8-NEXT: s_add_i32 s0, s0, s1
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_shl2_add_u32:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshl2_add_u32 s0, s0, s1
-; GFX10-NEXT: ; return to shader part epilog
- %shl = shl i32 %src0, 2
- %add = add i32 %shl, %src1
- ret i32 %add
-}
-
-define amdgpu_ps i32 @s_shl3_add_u32(i32 inreg %src0, i32 inreg %src1) {
-; GFX9-LABEL: s_shl3_add_u32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshl3_add_u32 s0, s0, s1
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_shl3_add_u32:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshl_b32 s0, s0, 3
-; GFX8-NEXT: s_add_i32 s0, s0, s1
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_shl3_add_u32:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshl3_add_u32 s0, s0, s1
-; GFX10-NEXT: ; return to shader part epilog
- %shl = shl i32 %src0, 3
- %add = add i32 %shl, %src1
- ret i32 %add
-}
-
-define amdgpu_ps i32 @s_shl4_add_u32(i32 inreg %src0, i32 inreg %src1) {
-; GFX9-LABEL: s_shl4_add_u32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshl4_add_u32 s0, s0, s1
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_shl4_add_u32:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshl_b32 s0, s0, 4
-; GFX8-NEXT: s_add_i32 s0, s0, s1
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_shl4_add_u32:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshl4_add_u32 s0, s0, s1
-; GFX10-NEXT: ; return to shader part epilog
- %shl = shl i32 %src0, 4
- %add = add i32 %shl, %src1
- ret i32 %add
-}
-
-define amdgpu_ps i32 @s_shl5_add_u32(i32 inreg %src0, i32 inreg %src1) {
-; GCN-LABEL: s_shl5_add_u32:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_lshl_b32 s0, s0, 5
-; GCN-NEXT: s_add_i32 s0, s0, s1
-; GCN-NEXT: ; return to shader part epilog
- %shl = shl i32 %src0, 5
- %add = add i32 %shl, %src1
- ret i32 %add
-}
-
-define i32 @v_shl1_add_u32(i32 %src0, i32 %src1) {
-; GFX9-LABEL: v_shl1_add_u32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshl_add_u32 v0, v0, 1, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_shl1_add_u32:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_shl1_add_u32:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshl_add_u32 v0, v0, 1, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %shl = shl i32 %src0, 1
- %add = add i32 %shl, %src1
- ret i32 %add
-}
-
-define i32 @v_shl2_add_u32(i32 %src0, i32 %src1) {
-; GFX9-LABEL: v_shl2_add_u32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_shl2_add_u32:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_shl2_add_u32:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %shl = shl i32 %src0, 2
- %add = add i32 %shl, %src1
- ret i32 %add
-}
-
-define i32 @v_shl3_add_u32(i32 %src0, i32 %src1) {
-; GFX9-LABEL: v_shl3_add_u32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshl_add_u32 v0, v0, 3, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_shl3_add_u32:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_shl3_add_u32:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshl_add_u32 v0, v0, 3, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %shl = shl i32 %src0, 3
- %add = add i32 %shl, %src1
- ret i32 %add
-}
-
-define i32 @v_shl4_add_u32(i32 %src0, i32 %src1) {
-; GFX9-LABEL: v_shl4_add_u32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshl_add_u32 v0, v0, 4, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_shl4_add_u32:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_shl4_add_u32:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshl_add_u32 v0, v0, 4, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %shl = shl i32 %src0, 4
- %add = add i32 %shl, %src1
- ret i32 %add
-}
-
-define i32 @v_shl5_add_u32(i32 %src0, i32 %src1) {
-; GFX9-LABEL: v_shl5_add_u32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshl_add_u32 v0, v0, 5, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_shl5_add_u32:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_shl5_add_u32:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshl_add_u32 v0, v0, 5, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %shl = shl i32 %src0, 5
- %add = add i32 %shl, %src1
- ret i32 %add
-}
-
-; FIXME: Use v_lshl_add_u32
-; shift is scalar, but add is vector.
-define amdgpu_ps float @shl1_add_u32_vgpr1(i32 inreg %src0, i32 %src1) {
-; GFX9-LABEL: shl1_add_u32_vgpr1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshl_b32 s0, s0, 1
-; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: shl1_add_u32_vgpr1:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshl_b32 s0, s0, 1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: shl1_add_u32_vgpr1:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshl_b32 s0, s0, 1
-; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX10-NEXT: ; return to shader part epilog
- %shl = shl i32 %src0, 1
- %add = add i32 %shl, %src1
- %cast = bitcast i32 %add to float
- ret float %cast
-}
-
-define amdgpu_ps float @shl2_add_u32_vgpr1(i32 inreg %src0, i32 %src1) {
-; GFX9-LABEL: shl2_add_u32_vgpr1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshl_b32 s0, s0, 2
-; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: shl2_add_u32_vgpr1:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshl_b32 s0, s0, 2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: shl2_add_u32_vgpr1:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshl_b32 s0, s0, 2
-; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX10-NEXT: ; return to shader part epilog
- %shl = shl i32 %src0, 2
- %add = add i32 %shl, %src1
- %cast = bitcast i32 %add to float
- ret float %cast
-}
-
-define amdgpu_ps float @shl3_add_u32_vgpr1(i32 inreg %src0, i32 %src1) {
-; GFX9-LABEL: shl3_add_u32_vgpr1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshl_b32 s0, s0, 3
-; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: shl3_add_u32_vgpr1:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshl_b32 s0, s0, 3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: shl3_add_u32_vgpr1:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshl_b32 s0, s0, 3
-; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX10-NEXT: ; return to shader part epilog
- %shl = shl i32 %src0, 3
- %add = add i32 %shl, %src1
- %cast = bitcast i32 %add to float
- ret float %cast
-}
-
-define amdgpu_ps float @shl4_add_u32_vgpr1(i32 inreg %src0, i32 %src1) {
-; GFX9-LABEL: shl4_add_u32_vgpr1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshl_b32 s0, s0, 4
-; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: shl4_add_u32_vgpr1:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshl_b32 s0, s0, 4
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: shl4_add_u32_vgpr1:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshl_b32 s0, s0, 4
-; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX10-NEXT: ; return to shader part epilog
- %shl = shl i32 %src0, 4
- %add = add i32 %shl, %src1
- %cast = bitcast i32 %add to float
- ret float %cast
-}
-
-define amdgpu_ps float @shl5_add_u32_vgpr1(i32 inreg %src0, i32 %src1) {
-; GFX9-LABEL: shl5_add_u32_vgpr1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshl_b32 s0, s0, 5
-; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: shl5_add_u32_vgpr1:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshl_b32 s0, s0, 5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: shl5_add_u32_vgpr1:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshl_b32 s0, s0, 5
-; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX10-NEXT: ; return to shader part epilog
- %shl = shl i32 %src0, 5
- %add = add i32 %shl, %src1
- %cast = bitcast i32 %add to float
- ret float %cast
-}
-
-define amdgpu_ps <2 x i32> @s_shl1_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
-; GFX9-LABEL: s_shl1_add_u32_v2:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshl1_add_u32 s0, s0, s2
-; GFX9-NEXT: s_lshl1_add_u32 s1, s1, s3
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_shl1_add_u32_v2:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshl_b32 s0, s0, 1
-; GFX8-NEXT: s_lshl_b32 s1, s1, 1
-; GFX8-NEXT: s_add_i32 s0, s0, s2
-; GFX8-NEXT: s_add_i32 s1, s1, s3
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_shl1_add_u32_v2:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshl1_add_u32 s0, s0, s2
-; GFX10-NEXT: s_lshl1_add_u32 s1, s1, s3
-; GFX10-NEXT: ; return to shader part epilog
- %shl = shl <2 x i32> %src0, <i32 1, i32 1>
- %add = add <2 x i32> %shl, %src1
- ret <2 x i32> %add
-}
-
-define amdgpu_ps <2 x i32> @s_shl2_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
-; GFX9-LABEL: s_shl2_add_u32_v2:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshl2_add_u32 s0, s0, s2
-; GFX9-NEXT: s_lshl2_add_u32 s1, s1, s3
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_shl2_add_u32_v2:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshl_b32 s0, s0, 2
-; GFX8-NEXT: s_lshl_b32 s1, s1, 2
-; GFX8-NEXT: s_add_i32 s0, s0, s2
-; GFX8-NEXT: s_add_i32 s1, s1, s3
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_shl2_add_u32_v2:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshl2_add_u32 s0, s0, s2
-; GFX10-NEXT: s_lshl2_add_u32 s1, s1, s3
-; GFX10-NEXT: ; return to shader part epilog
- %shl = shl <2 x i32> %src0, <i32 2, i32 2>
- %add = add <2 x i32> %shl, %src1
- ret <2 x i32> %add
-}
-
-define amdgpu_ps <2 x i32> @s_shl3_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
-; GFX9-LABEL: s_shl3_add_u32_v2:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshl3_add_u32 s0, s0, s2
-; GFX9-NEXT: s_lshl3_add_u32 s1, s1, s3
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_shl3_add_u32_v2:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshl_b32 s0, s0, 3
-; GFX8-NEXT: s_lshl_b32 s1, s1, 3
-; GFX8-NEXT: s_add_i32 s0, s0, s2
-; GFX8-NEXT: s_add_i32 s1, s1, s3
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_shl3_add_u32_v2:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshl3_add_u32 s0, s0, s2
-; GFX10-NEXT: s_lshl3_add_u32 s1, s1, s3
-; GFX10-NEXT: ; return to shader part epilog
- %shl = shl <2 x i32> %src0, <i32 3, i32 3>
- %add = add <2 x i32> %shl, %src1
- ret <2 x i32> %add
-}
-
-define amdgpu_ps <2 x i32> @s_shl4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
-; GFX9-LABEL: s_shl4_add_u32_v2:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshl4_add_u32 s0, s0, s2
-; GFX9-NEXT: s_lshl4_add_u32 s1, s1, s3
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_shl4_add_u32_v2:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshl_b32 s0, s0, 4
-; GFX8-NEXT: s_lshl_b32 s1, s1, 4
-; GFX8-NEXT: s_add_i32 s0, s0, s2
-; GFX8-NEXT: s_add_i32 s1, s1, s3
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_shl4_add_u32_v2:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshl4_add_u32 s0, s0, s2
-; GFX10-NEXT: s_lshl4_add_u32 s1, s1, s3
-; GFX10-NEXT: ; return to shader part epilog
- %shl = shl <2 x i32> %src0, <i32 4, i32 4>
- %add = add <2 x i32> %shl, %src1
- ret <2 x i32> %add
-}
-
-define amdgpu_ps <2 x i32> @s_shl_2_4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
-; GFX9-LABEL: s_shl_2_4_add_u32_v2:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshl2_add_u32 s0, s0, s2
-; GFX9-NEXT: s_lshl4_add_u32 s1, s1, s3
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_shl_2_4_add_u32_v2:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshl_b32 s0, s0, 2
-; GFX8-NEXT: s_lshl_b32 s1, s1, 4
-; GFX8-NEXT: s_add_i32 s0, s0, s2
-; GFX8-NEXT: s_add_i32 s1, s1, s3
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_shl_2_4_add_u32_v2:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshl2_add_u32 s0, s0, s2
-; GFX10-NEXT: s_lshl4_add_u32 s1, s1, s3
-; GFX10-NEXT: ; return to shader part epilog
- %shl = shl <2 x i32> %src0, <i32 2, i32 4>
- %add = add <2 x i32> %shl, %src1
- ret <2 x i32> %add
-}
-
-define amdgpu_ps { i32, i32 } @s_shl4_add_u32_multi_use(i32 inreg %src0, i32 inreg %src1) {
-; GCN-LABEL: s_shl4_add_u32_multi_use:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_lshl_b32 s0, s0, 4
-; GCN-NEXT: s_add_i32 s1, s0, s1
-; GCN-NEXT: ; return to shader part epilog
- %shl = shl i32 %src0, 4
- %add = add i32 %shl, %src1
- %insert0 = insertvalue { i32, i32 } poison, i32 %shl, 0
- %insert1 = insertvalue { i32, i32 } %insert0, i32 %add, 1
- ret { i32, i32 } %insert1
-}
-
-define amdgpu_ps { i32, i32 } @s_shl3_add_u32_multi_use(i32 inreg %src0, i32 inreg %src1) {
-; GCN-LABEL: s_shl3_add_u32_multi_use:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_lshl_b32 s0, s0, 3
-; GCN-NEXT: s_add_i32 s1, s0, s1
-; GCN-NEXT: ; return to shader part epilog
- %shl = shl i32 %src0, 3
- %add = add i32 %shl, %src1
- %insert0 = insertvalue { i32, i32 } poison, i32 %shl, 0
- %insert1 = insertvalue { i32, i32 } %insert0, i32 %add, 1
- ret { i32, i32 } %insert1
-}
-
-define amdgpu_ps { i32, i32 } @s_shl2_add_u32_multi_use(i32 inreg %src0, i32 inreg %src1) {
-; GCN-LABEL: s_shl2_add_u32_multi_use:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_lshl_b32 s0, s0, 2
-; GCN-NEXT: s_add_i32 s1, s0, s1
-; GCN-NEXT: ; return to shader part epilog
- %shl = shl i32 %src0, 2
- %add = add i32 %shl, %src1
- %insert0 = insertvalue { i32, i32 } poison, i32 %shl, 0
- %insert1 = insertvalue { i32, i32 } %insert0, i32 %add, 1
- ret { i32, i32 } %insert1
-}
-
-
-define amdgpu_ps { i32, i32 } @s_shl1_add_u32_multi_use(i32 inreg %src0, i32 inreg %src1) {
-; GCN-LABEL: s_shl1_add_u32_multi_use:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_lshl_b32 s0, s0, 1
-; GCN-NEXT: s_add_i32 s1, s0, s1
-; GCN-NEXT: ; return to shader part epilog
- %shl = shl i32 %src0, 1
- %add = add i32 %shl, %src1
- %insert0 = insertvalue { i32, i32 } poison, i32 %shl, 0
- %insert1 = insertvalue { i32, i32 } %insert0, i32 %add, 1
- ret { i32, i32 } %insert1
-}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index 40b5db0..39cf7b0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -31,128 +31,128 @@ define i64 @v_srem_i64(i64 %num, i64 %den) {
; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0
; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v1
-; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0
-; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc
+; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v0
+; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc
; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
-; CHECK-NEXT: v_trunc_f32_e32 v6, v3
-; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v6
-; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v6
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4]
-; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7]
-; CHECK-NEXT: v_mul_hi_u32 v7, v8, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2
-; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6
-; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12
-; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CHECK-NEXT: v_trunc_f32_e32 v3, v3
+; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3
+; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v2
+; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v3
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v11, 0
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[3:4]
+; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2
+; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[6:7]
+; CHECK-NEXT: v_mul_hi_u32 v6, v11, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
+; CHECK-NEXT: v_mul_lo_u32 v7, v11, v8
+; CHECK-NEXT: v_mul_lo_u32 v9, v10, v8
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6
+; CHECK-NEXT: v_mul_hi_u32 v6, v11, v8
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v12, v3
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v13, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7
-; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT: v_mul_hi_u32 v7, v10, v8
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2
-; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v3, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4]
-; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v5
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v9
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7]
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v9, vcc
-; CHECK-NEXT: v_xor_b32_e32 v5, v3, v9
-; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2
-; CHECK-NEXT: v_mul_lo_u32 v7, v8, v6
-; CHECK-NEXT: v_xor_b32_e32 v10, v4, v9
-; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v2
+; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v11, 0
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[3:4]
+; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v5
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v12
+; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[6:7]
+; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v12, vcc
+; CHECK-NEXT: v_xor_b32_e32 v9, v3, v12
+; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2
+; CHECK-NEXT: v_mul_lo_u32 v5, v11, v8
+; CHECK-NEXT: v_xor_b32_e32 v13, v4, v12
+; CHECK-NEXT: v_mul_hi_u32 v4, v11, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6
+; CHECK-NEXT: v_mul_lo_u32 v4, v10, v8
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT: v_mul_hi_u32 v5, v11, v8
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7
-; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CHECK-NEXT: v_mul_hi_u32 v5, v10, v8
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2
-; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc
-; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2
-; CHECK-NEXT: v_mul_lo_u32 v6, v5, v3
-; CHECK-NEXT: v_mul_hi_u32 v7, v5, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
-; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v11, v2
+; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v10, v3, vcc
+; CHECK-NEXT: v_mul_lo_u32 v4, v13, v2
+; CHECK-NEXT: v_mul_lo_u32 v5, v9, v3
+; CHECK-NEXT: v_mul_hi_u32 v6, v9, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v13, v2
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v7, v10, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; CHECK-NEXT: v_mul_hi_u32 v6, v5, v3
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT: v_mul_lo_u32 v6, v13, v3
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; CHECK-NEXT: v_mul_hi_u32 v5, v9, v3
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v2, v4
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v7, 0
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4]
-; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v7, v[3:4]
-; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc
-; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v2, v4
+; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v2
+; CHECK-NEXT: v_mul_hi_u32 v5, v13, v3
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v8, 0
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v5, v4
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v6, v[3:4]
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v9, v2
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[4:5]
+; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v6, vcc
+; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v13, v6
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1
-; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1
+; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc
; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5]
; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0
-; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc
+; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v4, vcc
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1
; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5]
; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v12
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v12
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v12
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc
; CHECK-NEXT: ; implicit-def: $vgpr2
; CHECK-NEXT: ; implicit-def: $vgpr4
; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
@@ -212,67 +212,67 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: s_subb_u32 s5, 0, s9
; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
-; CHECK-NEXT: v_trunc_f32_e32 v2, v1
-; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v0
-; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v2
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2]
-; CHECK-NEXT: v_mul_hi_u32 v5, v3, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2]
-; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0
-; CHECK-NEXT: v_mul_lo_u32 v6, v3, v1
-; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1
-; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1
-; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT: v_trunc_f32_e32 v1, v1
+; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
+; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v0
+; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v1
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2]
+; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3]
+; CHECK-NEXT: v_mul_hi_u32 v2, v6, v0
+; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0
+; CHECK-NEXT: v_mul_lo_u32 v3, v6, v4
+; CHECK-NEXT: v_mul_lo_u32 v5, v7, v4
+; CHECK-NEXT: v_mul_hi_u32 v8, v6, v4
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v0
+; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2]
+; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3]
+; CHECK-NEXT: v_mul_hi_u32 v3, v6, v0
+; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0
+; CHECK-NEXT: v_mul_lo_u32 v2, v6, v4
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2]
-; CHECK-NEXT: v_mul_hi_u32 v6, v3, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2]
-; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0
-; CHECK-NEXT: v_mul_lo_u32 v5, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v6, v4, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT: v_mul_hi_u32 v5, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v3, v7, v4
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT: v_mul_hi_u32 v2, v6, v4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
; CHECK-NEXT: v_mul_lo_u32 v2, s11, v0
; CHECK-NEXT: v_mul_lo_u32 v3, s10, v1
; CHECK-NEXT: v_mul_hi_u32 v4, s10, v0
; CHECK-NEXT: v_mul_hi_u32 v0, s11, v0
-; CHECK-NEXT: v_mul_hi_u32 v5, s11, v1
+; CHECK-NEXT: v_mov_b32_e32 v7, s11
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
@@ -285,19 +285,19 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v0, v2
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v2, v[1:2]
-; CHECK-NEXT: v_mov_b32_e32 v5, s11
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v0
+; CHECK-NEXT: v_mul_hi_u32 v3, s11, v1
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v3, v2
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v4, v[1:2]
+; CHECK-NEXT: v_mov_b32_e32 v1, s9
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s10, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2]
-; CHECK-NEXT: v_mov_b32_e32 v3, s9
-; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v5, v1, vcc
-; CHECK-NEXT: v_sub_i32_e64 v1, s[0:1], s11, v1
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3]
+; CHECK-NEXT: v_sub_i32_e64 v3, s[0:1], s11, v4
+; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v7, v4, vcc
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2
; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
@@ -372,261 +372,257 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5
; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8
-; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v5
-; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v8, vcc
+; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v5
+; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v8, vcc
; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v9
; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v4
-; GISEL-NEXT: v_trunc_f32_e32 v11, v9
-; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v11
-; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v11
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, 0
-; GISEL-NEXT: v_mov_b32_e32 v4, v10
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v4, v15, v9
-; GISEL-NEXT: v_mul_hi_u32 v16, v12, v9
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11]
+; GISEL-NEXT: v_trunc_f32_e32 v9, v9
+; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v9
+; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v9
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v4, 0
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v15, v[10:11]
+; GISEL-NEXT: v_mul_lo_u32 v10, v15, v9
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v4, v[11:12]
+; GISEL-NEXT: v_mul_hi_u32 v12, v4, v9
; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9
-; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11
+; GISEL-NEXT: v_mul_lo_u32 v11, v4, v13
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v16
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v16, v15, v10
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4
-; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v12, v15, v13
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_mul_hi_u32 v11, v4, v13
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11
-; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v4
-; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v15, v9, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, 0
-; GISEL-NEXT: v_mov_b32_e32 v4, v10
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5]
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT: v_mul_hi_u32 v12, v15, v13
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, v4, v9
+; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v15, v10, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v18, 0
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11]
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v15, v[10:11]
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; GISEL-NEXT: v_xor_b32_e32 v13, v0, v4
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v18, v[11:12]
+; GISEL-NEXT: v_xor_b32_e32 v14, v0, v4
; GISEL-NEXT: v_mul_lo_u32 v0, v15, v9
-; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10
-; GISEL-NEXT: v_xor_b32_e32 v14, v1, v4
-; GISEL-NEXT: v_mul_hi_u32 v1, v12, v9
+; GISEL-NEXT: v_mul_lo_u32 v10, v18, v13
+; GISEL-NEXT: v_xor_b32_e32 v16, v1, v4
+; GISEL-NEXT: v_mul_hi_u32 v1, v18, v9
; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v15, v10
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10
+; GISEL-NEXT: v_mul_lo_u32 v1, v15, v13
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT: v_mul_hi_u32 v10, v18, v13
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT: v_mul_hi_u32 v10, v15, v13
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v15, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v9, v14, v0
-; GISEL-NEXT: v_mul_lo_u32 v10, v13, v1
-; GISEL-NEXT: v_mul_hi_u32 v11, v13, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
+; GISEL-NEXT: v_mul_lo_u32 v9, v16, v0
+; GISEL-NEXT: v_mul_lo_u32 v10, v14, v1
+; GISEL-NEXT: v_mul_hi_u32 v11, v14, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v14, v1
+; GISEL-NEXT: v_mul_lo_u32 v11, v16, v1
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT: v_mul_hi_u32 v10, v13, v1
+; GISEL-NEXT: v_mul_hi_u32 v10, v14, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_mul_hi_u32 v1, v14, v1
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v9
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v9
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v1, v0
-; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v7
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v10, vcc
-; GISEL-NEXT: v_xor_b32_e32 v6, v6, v10
-; GISEL-NEXT: v_xor_b32_e32 v7, v7, v10
-; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6
-; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v7
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, 0
-; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6
-; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v15
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v9, v[1:2]
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v12
-; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v8, v11, v[9:10]
-; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
-; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1
-; GISEL-NEXT: v_trunc_f32_e32 v12, v10
-; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v12
-; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1
-; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12
-; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v13, v0
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0
-; GISEL-NEXT: v_mov_b32_e32 v0, v11
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v12, v[0:1]
-; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v14, v9, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v15, v[0:1]
-; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v14, v9
-; GISEL-NEXT: v_mul_lo_u32 v9, v12, v10
-; GISEL-NEXT: v_mul_lo_u32 v14, v15, v0
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v8
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
-; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14
-; GISEL-NEXT: v_mul_hi_u32 v14, v15, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_hi_u32 v10, v12, v10
-; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v13, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v11, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[6:7]
-; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v13, v5
-; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v1, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v14, v5
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v8
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9]
-; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v8
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v14, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7]
-; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v0
+; GISEL-NEXT: v_mul_hi_u32 v10, v16, v1
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v13, 0
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v10, v9
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v11, v[1:2]
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v13, v[9:10]
+; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v14, v0
+; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], v16, v11, vcc
+; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v16, v11
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5
+; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v8, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v8
+; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v10, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v11, v1, v9, s[4:5]
+; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v0, vcc
+; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v7
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v1
+; GISEL-NEXT: v_addc_u32_e64 v7, s[4:5], v7, v1, s[4:5]
+; GISEL-NEXT: v_xor_b32_e32 v6, v6, v1
+; GISEL-NEXT: v_xor_b32_e32 v7, v7, v1
+; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v6
+; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v7
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v5
+; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v9
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v0, v8, vcc
+; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1
+; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
+; GISEL-NEXT: v_trunc_f32_e32 v1, v1
+; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
+; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0
+; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v6
+; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v17, 0
+; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v7, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v18, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v21, v18, v0
+; GISEL-NEXT: v_mul_hi_u32 v22, v17, v0
+; GISEL-NEXT: v_mul_hi_u32 v23, v18, v0
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v13, v5
+; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v16, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
+; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v0, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v17, v[8:9]
; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v8, v19, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v14, v12, v0
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v18, v1
-; GISEL-NEXT: v_mul_hi_u32 v18, v15, v0
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10
+; GISEL-NEXT: v_mul_lo_u32 v1, v17, v0
+; GISEL-NEXT: v_mul_lo_u32 v9, v18, v0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v21, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v22
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1
+; GISEL-NEXT: v_mul_hi_u32 v8, v17, v0
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v23
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18
-; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v1
-; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v0, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
-; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v8, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v12, v[1:2]
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc
-; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v10, v[8:9]
-; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
-; GISEL-NEXT: v_xor_b32_e32 v13, v2, v5
-; GISEL-NEXT: v_mul_lo_u32 v2, v12, v0
-; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8
-; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5
-; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9
+; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v1
+; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v18, v0, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v14, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
+; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v15, v[1:2]
+; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v20, v14, v[8:9]
+; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc
+; GISEL-NEXT: v_xor_b32_e32 v11, v2, v12
+; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0
+; GISEL-NEXT: v_mul_lo_u32 v8, v14, v10
+; GISEL-NEXT: v_xor_b32_e32 v13, v3, v12
+; GISEL-NEXT: v_mul_hi_u32 v3, v14, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v12, v8
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2
-; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8
+; GISEL-NEXT: v_mul_lo_u32 v3, v15, v10
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2
+; GISEL-NEXT: v_mul_hi_u32 v8, v14, v10
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
-; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
+; GISEL-NEXT: v_mul_hi_u32 v8, v15, v10
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0
-; GISEL-NEXT: v_mul_lo_u32 v8, v13, v2
-; GISEL-NEXT: v_mul_hi_u32 v9, v13, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
-; GISEL-NEXT: v_xor_b32_e32 v10, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v15, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0
+; GISEL-NEXT: v_mul_lo_u32 v8, v11, v2
+; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
+; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v9, v14, v2
+; GISEL-NEXT: v_mul_lo_u32 v9, v13, v2
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3
-; GISEL-NEXT: v_mul_hi_u32 v8, v13, v2
+; GISEL-NEXT: v_mul_hi_u32 v8, v11, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3
-; GISEL-NEXT: v_mul_hi_u32 v9, v14, v2
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v0
-; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v8, v[0:1]
+; GISEL-NEXT: v_mul_hi_u32 v8, v13, v2
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v10, 0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v0, v[3:4]
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v11, v[8:9]
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v13, v2
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc
-; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v10, v[8:9]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2
+; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc
+; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v7
; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5]
-; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v2, v6
-; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[4:5]
+; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v2, v6
+; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6
; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v7
-; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v7
+; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v8, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
-; GISEL-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
+; GISEL-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v5
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v5
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_srem_v2i64:
@@ -651,128 +647,128 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_xor_b32_e32 v1, v2, v1
; CGP-NEXT: v_cvt_f32_u32_e32 v2, v0
; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1
-; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v0
-; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc
+; CGP-NEXT: v_sub_i32_e32 v16, vcc, 0, v0
+; CGP-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc
; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
-; CGP-NEXT: v_trunc_f32_e32 v4, v3
-; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v5, v2
-; CGP-NEXT: v_cvt_u32_f32_e32 v14, v4
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4]
-; CGP-NEXT: v_mul_hi_u32 v15, v5, v2
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4]
-; CGP-NEXT: v_mul_lo_u32 v4, v14, v2
+; CGP-NEXT: v_trunc_f32_e32 v3, v3
+; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3
+; CGP-NEXT: v_cvt_u32_f32_e32 v15, v2
+; CGP-NEXT: v_cvt_u32_f32_e32 v14, v3
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v15, 0
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[3:4]
+; CGP-NEXT: v_mul_lo_u32 v3, v14, v2
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v15, v[4:5]
+; CGP-NEXT: v_mul_hi_u32 v4, v15, v2
; CGP-NEXT: v_mul_hi_u32 v2, v14, v2
-; CGP-NEXT: v_mul_lo_u32 v16, v5, v3
-; CGP-NEXT: v_mul_lo_u32 v17, v14, v3
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v16
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15
-; CGP-NEXT: v_mul_hi_u32 v15, v5, v3
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v16, v4
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v17, v2
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15
-; CGP-NEXT: v_mul_hi_u32 v3, v14, v3
+; CGP-NEXT: v_mul_lo_u32 v5, v15, v12
+; CGP-NEXT: v_mul_lo_u32 v13, v14, v12
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v15, v12
+; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v2
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; CGP-NEXT: v_mul_hi_u32 v5, v14, v12
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v2
; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v3, vcc
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4]
-; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v11
-; CGP-NEXT: v_mul_hi_u32 v15, v5, v2
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4]
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v12
-; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v12, vcc
-; CGP-NEXT: v_xor_b32_e32 v11, v4, v12
-; CGP-NEXT: v_mul_lo_u32 v4, v14, v2
-; CGP-NEXT: v_mul_lo_u32 v13, v5, v3
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v15, 0
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[3:4]
+; CGP-NEXT: v_ashrrev_i32_e32 v16, 31, v11
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v16
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v15, v[4:5]
+; CGP-NEXT: v_addc_u32_e32 v4, vcc, v11, v16, vcc
+; CGP-NEXT: v_xor_b32_e32 v13, v3, v16
+; CGP-NEXT: v_mul_lo_u32 v3, v14, v2
+; CGP-NEXT: v_mul_lo_u32 v5, v15, v12
+; CGP-NEXT: v_xor_b32_e32 v17, v4, v16
+; CGP-NEXT: v_mul_hi_u32 v4, v15, v2
; CGP-NEXT: v_mul_hi_u32 v2, v14, v2
-; CGP-NEXT: v_xor_b32_e32 v10, v10, v12
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v4, v14, v12
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CGP-NEXT: v_mul_hi_u32 v5, v15, v12
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v15, v14, v3
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_mul_hi_u32 v13, v5, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v14, v12
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; CGP-NEXT: v_mul_hi_u32 v3, v14, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2
; CGP-NEXT: v_addc_u32_e32 v3, vcc, v14, v3, vcc
-; CGP-NEXT: v_mul_lo_u32 v4, v10, v2
-; CGP-NEXT: v_mul_lo_u32 v5, v11, v3
-; CGP-NEXT: v_mul_hi_u32 v13, v11, v2
-; CGP-NEXT: v_mul_hi_u32 v2, v10, v2
-; CGP-NEXT: v_mul_hi_u32 v14, v10, v3
+; CGP-NEXT: v_mul_lo_u32 v4, v17, v2
+; CGP-NEXT: v_mul_lo_u32 v5, v13, v3
+; CGP-NEXT: v_mul_hi_u32 v10, v13, v2
+; CGP-NEXT: v_mul_hi_u32 v2, v17, v2
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v10, v3
+; CGP-NEXT: v_mul_lo_u32 v10, v17, v3
; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CGP-NEXT: v_mul_hi_u32 v5, v11, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v5, v13, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v2
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v2, v4
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v13, 0
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4]
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v11, v2
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v13, v[3:4]
-; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc
-; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v2, v4
+; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v2
+; CGP-NEXT: v_mul_hi_u32 v5, v17, v3
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v12, 0
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v5, v4
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v10, v[3:4]
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v13, v2
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v1, v12, v[4:5]
+; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v17, v10, vcc
+; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v17, v10
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1
+; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc
; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[4:5]
; CGP-NEXT: v_sub_i32_e32 v10, vcc, v2, v0
-; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v3, vcc
+; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v4, vcc
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v0
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v1
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v10, v0
-; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[4:5]
; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
; CGP-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v12
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v12
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v12
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CGP-NEXT: v_xor_b32_e32 v0, v0, v16
+; CGP-NEXT: v_xor_b32_e32 v1, v1, v16
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v16
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v16, vcc
; CGP-NEXT: ; implicit-def: $vgpr4
; CGP-NEXT: ; implicit-def: $vgpr10
; CGP-NEXT: .LBB2_2: ; %Flow1
@@ -820,128 +816,128 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_xor_b32_e32 v3, v4, v3
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v5, v3
-; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v2
-; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v3, vcc
+; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v2
+; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc
; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; CGP-NEXT: v_trunc_f32_e32 v6, v5
-; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6
-; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v12, v6
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6]
-; CGP-NEXT: v_mul_hi_u32 v13, v7, v4
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6]
-; CGP-NEXT: v_mul_lo_u32 v6, v12, v4
+; CGP-NEXT: v_trunc_f32_e32 v5, v5
+; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5
+; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v12, v5
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v13, 0
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[5:6]
+; CGP-NEXT: v_mul_lo_u32 v5, v12, v4
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v13, v[6:7]
+; CGP-NEXT: v_mul_hi_u32 v6, v13, v4
; CGP-NEXT: v_mul_hi_u32 v4, v12, v4
-; CGP-NEXT: v_mul_lo_u32 v14, v7, v5
-; CGP-NEXT: v_mul_lo_u32 v15, v12, v5
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13
-; CGP-NEXT: v_mul_hi_u32 v13, v7, v5
-; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_mul_hi_u32 v5, v12, v5
+; CGP-NEXT: v_mul_lo_u32 v7, v13, v10
+; CGP-NEXT: v_mul_lo_u32 v11, v12, v10
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; CGP-NEXT: v_mul_hi_u32 v6, v13, v10
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v13, v6
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v4
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CGP-NEXT: v_mul_hi_u32 v7, v12, v10
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4
; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6]
-; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v9
-; CGP-NEXT: v_mul_hi_u32 v13, v7, v4
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6]
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v10
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v10, vcc
-; CGP-NEXT: v_xor_b32_e32 v9, v6, v10
-; CGP-NEXT: v_mul_lo_u32 v6, v12, v4
-; CGP-NEXT: v_mul_lo_u32 v11, v7, v5
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v13, 0
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[5:6]
+; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v14
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v13, v[6:7]
+; CGP-NEXT: v_addc_u32_e32 v6, vcc, v9, v14, vcc
+; CGP-NEXT: v_xor_b32_e32 v11, v5, v14
+; CGP-NEXT: v_mul_lo_u32 v5, v12, v4
+; CGP-NEXT: v_mul_lo_u32 v7, v13, v10
+; CGP-NEXT: v_xor_b32_e32 v15, v6, v14
+; CGP-NEXT: v_mul_hi_u32 v6, v13, v4
; CGP-NEXT: v_mul_hi_u32 v4, v12, v4
-; CGP-NEXT: v_xor_b32_e32 v8, v8, v10
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v6, v12, v10
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CGP-NEXT: v_mul_hi_u32 v7, v13, v10
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v12, v5
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6
-; CGP-NEXT: v_mul_hi_u32 v11, v7, v5
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; CGP-NEXT: v_mul_hi_u32 v7, v12, v10
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
-; CGP-NEXT: v_mul_hi_u32 v5, v12, v5
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4
; CGP-NEXT: v_addc_u32_e32 v5, vcc, v12, v5, vcc
-; CGP-NEXT: v_mul_lo_u32 v6, v8, v4
-; CGP-NEXT: v_mul_lo_u32 v7, v9, v5
-; CGP-NEXT: v_mul_hi_u32 v11, v9, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v8, v4
-; CGP-NEXT: v_mul_hi_u32 v12, v8, v5
+; CGP-NEXT: v_mul_lo_u32 v6, v15, v4
+; CGP-NEXT: v_mul_lo_u32 v7, v11, v5
+; CGP-NEXT: v_mul_hi_u32 v8, v11, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v15, v4
; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v11, v8, v5
+; CGP-NEXT: v_mul_lo_u32 v8, v15, v5
; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT: v_mul_hi_u32 v7, v9, v5
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v7, v11, v5
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v6
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v11, 0
-; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6]
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v9, v4
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v11, v[5:6]
-; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v8, v5, vcc
-; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v4, v6
+; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v4
+; CGP-NEXT: v_mul_hi_u32 v7, v15, v5
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v7, v6
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[5:6]
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v11, v4
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v10, v[6:7]
+; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v15, v8, vcc
+; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v15, v8
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
-; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v3
+; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc
; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5]
; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2
-; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc
+; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2
-; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v10
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v10
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v14
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v14
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v14
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc
; CGP-NEXT: ; implicit-def: $vgpr6
; CGP-NEXT: ; implicit-def: $vgpr8
; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
@@ -977,82 +973,82 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000
; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0
-; CHECK-NEXT: v_mov_b32_e32 v6, 0xfffff000
+; CHECK-NEXT: v_mov_b32_e32 v9, 0xfffff000
; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
-; CHECK-NEXT: v_trunc_f32_e32 v4, v3
-; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
-; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
-; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2
-; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3
-; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3
-; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3
-; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; CHECK-NEXT: v_trunc_f32_e32 v3, v3
+; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3
+; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2
+; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v3
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4]
+; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5]
+; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
+; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6
+; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6
+; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2
-; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
-; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
-; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6
-; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2
-; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3
-; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6
-; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2
+; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4]
+; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5]
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
+; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9
+; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2
+; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6
+; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9
+; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3
+; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8
-; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0
; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1
; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0
-; CHECK-NEXT: v_mov_b32_e32 v5, 0x1000
+; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0
+; CHECK-NEXT: v_mov_b32_e32 v6, 0x1000
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1
+; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0
@@ -1060,39 +1056,39 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_mul_hi_u32 v7, v9, v1
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2]
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT: v_mul_hi_u32 v7, v5, v1
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v3
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2]
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0
-; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc
-; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc
-; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5
+; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc
+; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2
+; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6
+; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6
+; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc
+; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v4, v6
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2
-; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
+; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v2, vcc
; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5]
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = srem i64 %num, 4096
ret i64 %result
@@ -1112,71 +1108,69 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: s_subb_u32 s6, 0, 0
; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT: v_trunc_f32_e32 v7, v5
-; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7
-; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v7
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0
-; GISEL-NEXT: v_mov_b32_e32 v7, v5
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8]
-; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[10:11]
-; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4
-; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4
-; GISEL-NEXT: v_mul_lo_u32 v7, v8, v13
-; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7
+; GISEL-NEXT: v_trunc_f32_e32 v5, v5
+; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v5
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6]
+; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4
+; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[9:10]
+; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4
+; GISEL-NEXT: v_mul_lo_u32 v9, v7, v13
+; GISEL-NEXT: v_mul_lo_u32 v4, v8, v13
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
-; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9
+; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc
-; GISEL-NEXT: v_mov_b32_e32 v4, v14
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15]
+; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v6, v17, v[14:15]
+; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13
+; GISEL-NEXT: v_mul_hi_u32 v18, v4, v13
+; GISEL-NEXT: v_mul_hi_u32 v19, v17, v13
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v4, v[15:16]
; GISEL-NEXT: s_mov_b32 s6, 1
; GISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GISEL-NEXT: v_mul_lo_u32 v7, v16, v14
+; GISEL-NEXT: v_mul_lo_u32 v14, v4, v13
+; GISEL-NEXT: v_mul_hi_u32 v15, v4, v13
; GISEL-NEXT: s_subb_u32 s6, 0, 0
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT: v_mul_hi_u32 v7, v16, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_mul_hi_u32 v7, v17, v13
-; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v7, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v7
-; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
-; GISEL-NEXT: v_xor_b32_e32 v18, v0, v7
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4
-; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14
-; GISEL-NEXT: v_xor_b32_e32 v19, v1, v7
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v9
+; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13
+; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v19
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v9
+; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
+; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v14
+; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0
; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1
@@ -1195,122 +1189,120 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13
-; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2]
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14]
-; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0
-; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13
-; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc
+; GISEL-NEXT: v_mul_hi_u32 v1, v19, v1
+; GISEL-NEXT: v_add_i32_e32 v17, vcc, v0, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v1, v0
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v17, 0
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2]
+; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v17, v[13:14]
+; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v18, v0
+; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v15
+; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v19, v15, vcc
; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4
-; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
+; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4
+; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
-; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc
-; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1]
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4
-; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
-; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
+; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6]
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[0:1]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v4
+; GISEL-NEXT: v_mul_lo_u32 v5, v7, v13
+; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v19, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
+; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v18, v19, v1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v5, v8, v13
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GISEL-NEXT: v_mul_hi_u32 v1, v7, v13
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; GISEL-NEXT: v_mul_hi_u32 v10, v8, v13
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_mul_hi_u32 v10, v8, v0
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
-; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc
-; GISEL-NEXT: v_xor_b32_e32 v1, v10, v7
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6]
-; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
-; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10
-; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0
-; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5
-; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10
-; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v0
+; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v8, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v14, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2]
+; GISEL-NEXT: v_xor_b32_e32 v1, v5, v9
+; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[7:8]
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc
+; GISEL-NEXT: v_xor_b32_e32 v14, v2, v13
+; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0
+; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5
+; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13
+; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5
+; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5
+; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0
-; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2
-; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
-; GISEL-NEXT: v_xor_b32_e32 v8, v11, v7
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0
+; GISEL-NEXT: v_mul_lo_u32 v5, v14, v2
+; GISEL-NEXT: v_mul_hi_u32 v6, v14, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
+; GISEL-NEXT: v_xor_b32_e32 v7, v12, v9
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2
+; GISEL-NEXT: v_mul_lo_u32 v6, v15, v2
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
-; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2
+; GISEL-NEXT: v_mul_hi_u32 v5, v14, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3
-; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0
-; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[0:1]
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6]
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
+; GISEL-NEXT: v_mul_hi_u32 v5, v15, v2
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v9
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v9, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v7
+; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v15, v7, vcc
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
@@ -1330,10 +1322,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, v2, v13
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v13
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v13
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_srem_v2i64_pow2k_denom:
@@ -1346,27 +1338,26 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; CGP-NEXT: v_trunc_f32_e32 v8, v5
-; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8
+; CGP-NEXT: v_trunc_f32_e32 v5, v5
+; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5
; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8
+; CGP-NEXT: v_cvt_u32_f32_e32 v8, v5
; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
-; CGP-NEXT: v_mov_b32_e32 v9, v5
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10]
-; CGP-NEXT: v_mul_hi_u32 v11, v7, v4
-; CGP-NEXT: v_mul_hi_u32 v12, v8, v4
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10]
-; CGP-NEXT: v_mul_lo_u32 v10, v8, v4
-; CGP-NEXT: v_mul_lo_u32 v4, v7, v9
-; CGP-NEXT: v_mul_lo_u32 v13, v8, v9
-; CGP-NEXT: v_mul_hi_u32 v14, v7, v9
-; CGP-NEXT: v_mul_hi_u32 v9, v8, v9
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6]
+; CGP-NEXT: v_mul_hi_u32 v11, v8, v4
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v7, v[9:10]
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v4
+; CGP-NEXT: v_mul_hi_u32 v10, v7, v4
+; CGP-NEXT: v_mul_lo_u32 v4, v7, v12
+; CGP-NEXT: v_mul_lo_u32 v13, v8, v12
+; CGP-NEXT: v_mul_hi_u32 v14, v7, v12
+; CGP-NEXT: v_mul_hi_u32 v12, v8, v12
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v11
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
@@ -1374,41 +1365,40 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc
-; CGP-NEXT: v_mov_b32_e32 v4, v14
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; CGP-NEXT: v_mul_lo_u32 v4, v17, v13
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15]
-; CGP-NEXT: v_mul_lo_u32 v9, v16, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT: v_mul_hi_u32 v9, v16, v13
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_hi_u32 v9, v17, v13
-; CGP-NEXT: v_mul_lo_u32 v13, v17, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_mul_hi_u32 v15, v16, v14
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; CGP-NEXT: v_addc_u32_e32 v16, vcc, v8, v12, vcc
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v6, v4, 0
+; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v16, v[13:14]
+; CGP-NEXT: v_mul_lo_u32 v17, v16, v12
+; CGP-NEXT: v_mul_hi_u32 v18, v4, v12
+; CGP-NEXT: v_mul_hi_u32 v19, v16, v12
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v4, v[14:15]
+; CGP-NEXT: v_mul_lo_u32 v13, v4, v12
+; CGP-NEXT: v_mul_hi_u32 v15, v4, v12
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v17, v13
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15
-; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9
-; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9
-; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
-; CGP-NEXT: v_xor_b32_e32 v18, v0, v9
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v17, v14
-; CGP-NEXT: v_xor_b32_e32 v19, v1, v9
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT: v_mul_lo_u32 v14, v16, v12
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19
+; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15
+; CGP-NEXT: v_mul_hi_u32 v17, v16, v12
+; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc
+; CGP-NEXT: v_xor_b32_e32 v18, v0, v12
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v13
+; CGP-NEXT: v_xor_b32_e32 v19, v1, v12
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v17, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc
; CGP-NEXT: v_mul_lo_u32 v13, v19, v0
; CGP-NEXT: v_mul_lo_u32 v14, v18, v1
; CGP-NEXT: v_mul_hi_u32 v15, v18, v0
@@ -1426,119 +1416,118 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13
+; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; CGP-NEXT: v_mul_hi_u32 v15, v19, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2]
-; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v0
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v14
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2]
+; CGP-NEXT: v_sub_i32_e32 v15, vcc, v18, v0
; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13
-; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc
+; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v19, v13, vcc
; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4
-; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4
+; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4
+; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
-; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc
-; CGP-NEXT: v_mov_b32_e32 v0, v5
-; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5]
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1]
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1]
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
-; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4
-; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; CGP-NEXT: v_mul_lo_u32 v5, v7, v0
-; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11
-; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v10, v8, v0
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1
-; CGP-NEXT: v_mul_hi_u32 v5, v7, v0
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc
-; CGP-NEXT: v_xor_b32_e32 v11, v5, v9
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2]
-; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc
-; CGP-NEXT: v_xor_b32_e32 v1, v10, v9
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6]
-; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
-; CGP-NEXT: v_xor_b32_e32 v12, v2, v10
-; CGP-NEXT: v_mul_lo_u32 v2, v8, v0
-; CGP-NEXT: v_mul_lo_u32 v6, v7, v5
-; CGP-NEXT: v_xor_b32_e32 v13, v3, v10
-; CGP-NEXT: v_mul_hi_u32 v3, v7, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16
+; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
+; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6]
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v18, v4
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[0:1]
+; CGP-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v19, vcc
+; CGP-NEXT: v_mul_lo_u32 v1, v7, v13
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
+; CGP-NEXT: v_cndmask_b32_e32 v0, v18, v5, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v5, v19, v21, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; CGP-NEXT: v_cndmask_b32_e32 v14, v15, v0, vcc
+; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v9, v1
+; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v0, v10
+; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v13
+; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0
+; CGP-NEXT: v_mul_hi_u32 v1, v7, v13
+; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9
+; CGP-NEXT: v_mul_hi_u32 v10, v8, v13
+; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0
+; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1
+; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1
+; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v7, v0
+; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v1, s[4:5]
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0
+; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc
+; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v3
+; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v10, v[1:2]
+; CGP-NEXT: v_xor_b32_e32 v1, v5, v12
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v9, v[7:8]
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc
+; CGP-NEXT: v_xor_b32_e32 v7, v2, v13
+; CGP-NEXT: v_mul_lo_u32 v2, v10, v0
+; CGP-NEXT: v_mul_lo_u32 v6, v9, v5
+; CGP-NEXT: v_xor_b32_e32 v8, v3, v13
+; CGP-NEXT: v_mul_hi_u32 v3, v9, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v10, v0
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v3, v8, v5
+; CGP-NEXT: v_mul_lo_u32 v3, v10, v5
; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CGP-NEXT: v_mul_hi_u32 v6, v7, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v9, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v8, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v10, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0
-; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc
-; CGP-NEXT: v_mul_lo_u32 v5, v13, v3
-; CGP-NEXT: v_mul_lo_u32 v6, v12, v2
-; CGP-NEXT: v_mul_hi_u32 v7, v12, v3
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v0
+; CGP-NEXT: v_addc_u32_e32 v2, vcc, v10, v2, vcc
+; CGP-NEXT: v_mul_lo_u32 v5, v8, v3
+; CGP-NEXT: v_mul_lo_u32 v6, v7, v2
+; CGP-NEXT: v_xor_b32_e32 v11, v14, v12
+; CGP-NEXT: v_mul_hi_u32 v9, v7, v3
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v12
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v7, v13, v2
-; CGP-NEXT: v_mul_hi_u32 v3, v13, v3
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v2
+; CGP-NEXT: v_mul_hi_u32 v3, v8, v3
; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_mul_hi_u32 v6, v12, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v6, v7, v2
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; CGP-NEXT: v_mul_hi_u32 v7, v13, v2
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4]
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
-; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5
+; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v9, v8, v2
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v3
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v6
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4]
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
+; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc
+; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5
; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4
; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
@@ -1558,10 +1547,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v10
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v10
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v13
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v13
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = srem <2 x i64> %num, <i64 4096, i64 4096>
ret <2 x i64> %result
@@ -1573,82 +1562,82 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb
; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0
-; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705
+; CHECK-NEXT: v_mov_b32_e32 v9, 0xffed2705
; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
-; CHECK-NEXT: v_trunc_f32_e32 v4, v3
-; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
-; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
-; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2
-; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3
-; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3
-; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3
-; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; CHECK-NEXT: v_trunc_f32_e32 v3, v3
+; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3
+; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2
+; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v3
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4]
+; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5]
+; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
+; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6
+; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6
+; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2
-; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
-; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
-; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6
-; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2
-; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3
-; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6
-; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2
+; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4]
+; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5]
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
+; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9
+; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2
+; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6
+; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9
+; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3
+; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8
-; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0
; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1
; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0
-; CHECK-NEXT: v_mov_b32_e32 v5, 0x12d8fb
+; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0
+; CHECK-NEXT: v_mov_b32_e32 v6, 0x12d8fb
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1
+; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0
@@ -1656,39 +1645,39 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_mul_hi_u32 v7, v9, v1
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2]
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT: v_mul_hi_u32 v7, v5, v1
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v3
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2]
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0
-; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc
-; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc
-; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5
+; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc
+; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2
+; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6
+; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6
+; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc
+; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v4, v6
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2
-; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
+; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v2, vcc
; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5]
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = srem i64 %num, 1235195
ret i64 %result
@@ -1708,71 +1697,69 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: s_subb_u32 s6, 0, 0
; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT: v_trunc_f32_e32 v7, v5
-; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7
-; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v7
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0
-; GISEL-NEXT: v_mov_b32_e32 v7, v5
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8]
-; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[10:11]
-; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4
-; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4
-; GISEL-NEXT: v_mul_lo_u32 v7, v8, v13
-; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7
+; GISEL-NEXT: v_trunc_f32_e32 v5, v5
+; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v5
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6]
+; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4
+; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[9:10]
+; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4
+; GISEL-NEXT: v_mul_lo_u32 v9, v7, v13
+; GISEL-NEXT: v_mul_lo_u32 v4, v8, v13
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
-; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9
+; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc
-; GISEL-NEXT: v_mov_b32_e32 v4, v14
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15]
+; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v6, v17, v[14:15]
+; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13
+; GISEL-NEXT: v_mul_hi_u32 v18, v4, v13
+; GISEL-NEXT: v_mul_hi_u32 v19, v17, v13
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v4, v[15:16]
; GISEL-NEXT: s_mov_b32 s6, 1
; GISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GISEL-NEXT: v_mul_lo_u32 v7, v16, v14
+; GISEL-NEXT: v_mul_lo_u32 v14, v4, v13
+; GISEL-NEXT: v_mul_hi_u32 v15, v4, v13
; GISEL-NEXT: s_subb_u32 s6, 0, 0
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT: v_mul_hi_u32 v7, v16, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_mul_hi_u32 v7, v17, v13
-; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v7, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v7
-; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
-; GISEL-NEXT: v_xor_b32_e32 v18, v0, v7
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4
-; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14
-; GISEL-NEXT: v_xor_b32_e32 v19, v1, v7
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v9
+; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13
+; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v19
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v9
+; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
+; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v14
+; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0
; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1
@@ -1791,122 +1778,120 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13
-; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2]
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14]
-; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0
-; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13
-; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc
+; GISEL-NEXT: v_mul_hi_u32 v1, v19, v1
+; GISEL-NEXT: v_add_i32_e32 v17, vcc, v0, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v1, v0
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v17, 0
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2]
+; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v17, v[13:14]
+; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v18, v0
+; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v15
+; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v19, v15, vcc
; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4
-; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
+; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4
+; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
-; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc
-; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1]
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4
-; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
-; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
+; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6]
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[0:1]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v4
+; GISEL-NEXT: v_mul_lo_u32 v5, v7, v13
+; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v19, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
+; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v18, v19, v1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v5, v8, v13
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GISEL-NEXT: v_mul_hi_u32 v1, v7, v13
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; GISEL-NEXT: v_mul_hi_u32 v10, v8, v13
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_mul_hi_u32 v10, v8, v0
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
-; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc
-; GISEL-NEXT: v_xor_b32_e32 v1, v10, v7
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6]
-; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
-; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10
-; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0
-; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5
-; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10
-; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v0
+; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v8, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v14, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2]
+; GISEL-NEXT: v_xor_b32_e32 v1, v5, v9
+; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[7:8]
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc
+; GISEL-NEXT: v_xor_b32_e32 v14, v2, v13
+; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0
+; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5
+; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13
+; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5
+; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5
+; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0
-; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2
-; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
-; GISEL-NEXT: v_xor_b32_e32 v8, v11, v7
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0
+; GISEL-NEXT: v_mul_lo_u32 v5, v14, v2
+; GISEL-NEXT: v_mul_hi_u32 v6, v14, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
+; GISEL-NEXT: v_xor_b32_e32 v7, v12, v9
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2
+; GISEL-NEXT: v_mul_lo_u32 v6, v15, v2
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
-; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2
+; GISEL-NEXT: v_mul_hi_u32 v5, v14, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3
-; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0
-; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[0:1]
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6]
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
+; GISEL-NEXT: v_mul_hi_u32 v5, v15, v2
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v9
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v9, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v7
+; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v15, v7, vcc
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
@@ -1926,10 +1911,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, v2, v13
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v13
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v13
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_srem_v2i64_oddk_denom:
@@ -1942,27 +1927,26 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; CGP-NEXT: v_trunc_f32_e32 v8, v5
-; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8
+; CGP-NEXT: v_trunc_f32_e32 v5, v5
+; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5
; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8
+; CGP-NEXT: v_cvt_u32_f32_e32 v8, v5
; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
-; CGP-NEXT: v_mov_b32_e32 v9, v5
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10]
-; CGP-NEXT: v_mul_hi_u32 v11, v7, v4
-; CGP-NEXT: v_mul_hi_u32 v12, v8, v4
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10]
-; CGP-NEXT: v_mul_lo_u32 v10, v8, v4
-; CGP-NEXT: v_mul_lo_u32 v4, v7, v9
-; CGP-NEXT: v_mul_lo_u32 v13, v8, v9
-; CGP-NEXT: v_mul_hi_u32 v14, v7, v9
-; CGP-NEXT: v_mul_hi_u32 v9, v8, v9
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6]
+; CGP-NEXT: v_mul_hi_u32 v11, v8, v4
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v7, v[9:10]
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v4
+; CGP-NEXT: v_mul_hi_u32 v10, v7, v4
+; CGP-NEXT: v_mul_lo_u32 v4, v7, v12
+; CGP-NEXT: v_mul_lo_u32 v13, v8, v12
+; CGP-NEXT: v_mul_hi_u32 v14, v7, v12
+; CGP-NEXT: v_mul_hi_u32 v12, v8, v12
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v11
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
@@ -1970,41 +1954,40 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc
-; CGP-NEXT: v_mov_b32_e32 v4, v14
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; CGP-NEXT: v_mul_lo_u32 v4, v17, v13
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15]
-; CGP-NEXT: v_mul_lo_u32 v9, v16, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT: v_mul_hi_u32 v9, v16, v13
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_hi_u32 v9, v17, v13
-; CGP-NEXT: v_mul_lo_u32 v13, v17, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_mul_hi_u32 v15, v16, v14
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; CGP-NEXT: v_addc_u32_e32 v16, vcc, v8, v12, vcc
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v6, v4, 0
+; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v16, v[13:14]
+; CGP-NEXT: v_mul_lo_u32 v17, v16, v12
+; CGP-NEXT: v_mul_hi_u32 v18, v4, v12
+; CGP-NEXT: v_mul_hi_u32 v19, v16, v12
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v4, v[14:15]
+; CGP-NEXT: v_mul_lo_u32 v13, v4, v12
+; CGP-NEXT: v_mul_hi_u32 v15, v4, v12
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v17, v13
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15
-; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9
-; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9
-; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
-; CGP-NEXT: v_xor_b32_e32 v18, v0, v9
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v17, v14
-; CGP-NEXT: v_xor_b32_e32 v19, v1, v9
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT: v_mul_lo_u32 v14, v16, v12
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19
+; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15
+; CGP-NEXT: v_mul_hi_u32 v17, v16, v12
+; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc
+; CGP-NEXT: v_xor_b32_e32 v18, v0, v12
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v13
+; CGP-NEXT: v_xor_b32_e32 v19, v1, v12
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v17, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc
; CGP-NEXT: v_mul_lo_u32 v13, v19, v0
; CGP-NEXT: v_mul_lo_u32 v14, v18, v1
; CGP-NEXT: v_mul_hi_u32 v15, v18, v0
@@ -2022,119 +2005,118 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13
+; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; CGP-NEXT: v_mul_hi_u32 v15, v19, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2]
-; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v0
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v14
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2]
+; CGP-NEXT: v_sub_i32_e32 v15, vcc, v18, v0
; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13
-; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc
+; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v19, v13, vcc
; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4
-; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4
+; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4
+; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
-; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc
-; CGP-NEXT: v_mov_b32_e32 v0, v5
-; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5]
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1]
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1]
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
-; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4
-; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; CGP-NEXT: v_mul_lo_u32 v5, v7, v0
-; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11
-; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v10, v8, v0
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1
-; CGP-NEXT: v_mul_hi_u32 v5, v7, v0
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc
-; CGP-NEXT: v_xor_b32_e32 v11, v5, v9
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2]
-; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc
-; CGP-NEXT: v_xor_b32_e32 v1, v10, v9
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6]
-; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
-; CGP-NEXT: v_xor_b32_e32 v12, v2, v10
-; CGP-NEXT: v_mul_lo_u32 v2, v8, v0
-; CGP-NEXT: v_mul_lo_u32 v6, v7, v5
-; CGP-NEXT: v_xor_b32_e32 v13, v3, v10
-; CGP-NEXT: v_mul_hi_u32 v3, v7, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16
+; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
+; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6]
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v18, v4
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[0:1]
+; CGP-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v19, vcc
+; CGP-NEXT: v_mul_lo_u32 v1, v7, v13
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
+; CGP-NEXT: v_cndmask_b32_e32 v0, v18, v5, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v5, v19, v21, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; CGP-NEXT: v_cndmask_b32_e32 v14, v15, v0, vcc
+; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v9, v1
+; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v0, v10
+; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v13
+; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0
+; CGP-NEXT: v_mul_hi_u32 v1, v7, v13
+; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9
+; CGP-NEXT: v_mul_hi_u32 v10, v8, v13
+; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0
+; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1
+; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1
+; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v7, v0
+; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v1, s[4:5]
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0
+; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc
+; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v3
+; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v10, v[1:2]
+; CGP-NEXT: v_xor_b32_e32 v1, v5, v12
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v9, v[7:8]
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc
+; CGP-NEXT: v_xor_b32_e32 v7, v2, v13
+; CGP-NEXT: v_mul_lo_u32 v2, v10, v0
+; CGP-NEXT: v_mul_lo_u32 v6, v9, v5
+; CGP-NEXT: v_xor_b32_e32 v8, v3, v13
+; CGP-NEXT: v_mul_hi_u32 v3, v9, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v10, v0
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v3, v8, v5
+; CGP-NEXT: v_mul_lo_u32 v3, v10, v5
; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CGP-NEXT: v_mul_hi_u32 v6, v7, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v9, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v8, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v10, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0
-; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc
-; CGP-NEXT: v_mul_lo_u32 v5, v13, v3
-; CGP-NEXT: v_mul_lo_u32 v6, v12, v2
-; CGP-NEXT: v_mul_hi_u32 v7, v12, v3
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v0
+; CGP-NEXT: v_addc_u32_e32 v2, vcc, v10, v2, vcc
+; CGP-NEXT: v_mul_lo_u32 v5, v8, v3
+; CGP-NEXT: v_mul_lo_u32 v6, v7, v2
+; CGP-NEXT: v_xor_b32_e32 v11, v14, v12
+; CGP-NEXT: v_mul_hi_u32 v9, v7, v3
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v12
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v7, v13, v2
-; CGP-NEXT: v_mul_hi_u32 v3, v13, v3
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v2
+; CGP-NEXT: v_mul_hi_u32 v3, v8, v3
; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_mul_hi_u32 v6, v12, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v6, v7, v2
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; CGP-NEXT: v_mul_hi_u32 v7, v13, v2
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4]
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
-; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5
+; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v9, v8, v2
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v3
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v6
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4]
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
+; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc
+; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5
; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4
; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
@@ -2154,10 +2136,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v10
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v10
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v13
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v13
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = srem <2 x i64> %num, <i64 1235195, i64 1235195>
ret <2 x i64> %result
@@ -2193,130 +2175,128 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0
; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v1
-; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0
-; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc
+; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v0
+; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc
; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v5
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2
-; CHECK-NEXT: v_trunc_f32_e32 v7, v5
-; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v7
-; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7
-; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0
-; CHECK-NEXT: v_mov_b32_e32 v2, v6
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3]
-; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7]
-; CHECK-NEXT: v_mul_hi_u32 v7, v8, v5
+; CHECK-NEXT: v_trunc_f32_e32 v5, v5
+; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v5
+; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2
+; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v5
+; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v2, 0
+; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v11, v[6:7]
+; CHECK-NEXT: v_mul_lo_u32 v6, v11, v5
+; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v2, v[7:8]
+; CHECK-NEXT: v_mul_hi_u32 v7, v2, v5
; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5
-; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6
-; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v12
-; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
-; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v12, v2
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v13, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v8, v2, v9
+; CHECK-NEXT: v_mul_lo_u32 v10, v11, v9
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; CHECK-NEXT: v_mul_hi_u32 v7, v2, v9
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7
-; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2
-; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v5, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0
-; CHECK-NEXT: v_mov_b32_e32 v2, v6
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3]
-; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v9
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7]
-; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v9, vcc
-; CHECK-NEXT: v_xor_b32_e32 v7, v2, v9
-; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5
-; CHECK-NEXT: v_mul_lo_u32 v4, v8, v6
-; CHECK-NEXT: v_xor_b32_e32 v10, v3, v9
-; CHECK-NEXT: v_mul_hi_u32 v3, v8, v5
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT: v_mul_hi_u32 v8, v11, v9
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v6, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v2, 0
+; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v11, v[6:7]
+; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v4
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12
+; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v2, v[7:8]
+; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v12, vcc
+; CHECK-NEXT: v_xor_b32_e32 v8, v3, v12
+; CHECK-NEXT: v_mul_lo_u32 v3, v11, v5
+; CHECK-NEXT: v_mul_lo_u32 v6, v2, v9
+; CHECK-NEXT: v_xor_b32_e32 v10, v4, v12
+; CHECK-NEXT: v_mul_hi_u32 v4, v2, v5
; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v3, v11, v6
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2
-; CHECK-NEXT: v_mul_hi_u32 v4, v8, v6
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT: v_mul_hi_u32 v5, v11, v6
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v4, v11, v9
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3
+; CHECK-NEXT: v_mul_hi_u32 v6, v2, v9
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; CHECK-NEXT: v_mul_hi_u32 v6, v11, v9
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2
-; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v4, vcc
; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2
-; CHECK-NEXT: v_mul_lo_u32 v5, v7, v3
-; CHECK-NEXT: v_mul_hi_u32 v6, v7, v2
+; CHECK-NEXT: v_mul_lo_u32 v5, v8, v3
+; CHECK-NEXT: v_mul_hi_u32 v6, v8, v2
; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
-; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CHECK-NEXT: v_mul_lo_u32 v6, v10, v3
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT: v_mul_hi_u32 v5, v7, v3
+; CHECK-NEXT: v_mul_hi_u32 v5, v8, v3
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v2, v4
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v6, 0
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4]
-; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v6, v[3:4]
-; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc
-; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v2, v4
+; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v2
+; CHECK-NEXT: v_mul_hi_u32 v5, v10, v3
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v9, 0
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v5, v4
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v6, v[3:4]
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v8, v2
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v9, v[4:5]
+; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v10, v6, vcc
+; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v10, v6
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1
-; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1
+; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc
; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5]
; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0
-; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc
+; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v4, vcc
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1
; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5]
; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v12
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v12
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v12
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc
; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6
; CHECK-NEXT: ; implicit-def: $vgpr3
; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
@@ -2351,224 +2331,220 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-LABEL: v_srem_v2i64_pow2_shl_denom:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v8, 0x1000
-; GISEL-NEXT: v_mov_b32_e32 v9, 0
-; GISEL-NEXT: v_lshl_b64 v[4:5], v[8:9], v4
+; GISEL-NEXT: v_mov_b32_e32 v10, 0x1000
+; GISEL-NEXT: v_mov_b32_e32 v11, 0
+; GISEL-NEXT: v_lshl_b64 v[4:5], v[10:11], v4
; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v5
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v5, v7, vcc
+; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v5, v7, vcc
; GISEL-NEXT: v_xor_b32_e32 v5, v4, v7
-; GISEL-NEXT: v_xor_b32_e32 v7, v10, v7
+; GISEL-NEXT: v_xor_b32_e32 v7, v8, v7
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5
-; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v7
-; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v5
-; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v7, vcc
-; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v10
+; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v7
+; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v5
+; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v7, vcc
+; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v8
; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4
-; GISEL-NEXT: v_trunc_f32_e32 v12, v10
-; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12
-; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v12
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
-; GISEL-NEXT: v_mov_b32_e32 v4, v11
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v4, v16, v10
-; GISEL-NEXT: v_mul_hi_u32 v17, v13, v10
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
-; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10
-; GISEL-NEXT: v_mul_lo_u32 v12, v13, v11
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v17
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v17, v16, v11
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4
-; GISEL-NEXT: v_mul_hi_u32 v12, v13, v11
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v17, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v4
+; GISEL-NEXT: v_trunc_f32_e32 v8, v8
+; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8
+; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v8
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v4, 0
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v16, v[9:10]
+; GISEL-NEXT: v_mul_lo_u32 v9, v16, v8
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v4, v[12:13]
+; GISEL-NEXT: v_mul_lo_u32 v12, v4, v14
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v4, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v8, v16, v8
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
+; GISEL-NEXT: v_mul_hi_u32 v13, v4, v14
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12
-; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v4
-; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v16, v10, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
-; GISEL-NEXT: v_mov_b32_e32 v4, v11
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5]
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT: v_mul_hi_u32 v13, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
+; GISEL-NEXT: v_add_i32_e32 v19, vcc, v4, v8
+; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v16, v9, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v19, 0
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v16, v[9:10]
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; GISEL-NEXT: v_xor_b32_e32 v12, v0, v4
-; GISEL-NEXT: v_mul_lo_u32 v0, v16, v10
-; GISEL-NEXT: v_mul_lo_u32 v14, v13, v11
-; GISEL-NEXT: v_xor_b32_e32 v15, v1, v4
-; GISEL-NEXT: v_mul_hi_u32 v1, v13, v10
-; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v19, v[12:13]
+; GISEL-NEXT: v_xor_b32_e32 v15, v0, v4
+; GISEL-NEXT: v_mul_lo_u32 v0, v16, v8
+; GISEL-NEXT: v_mul_lo_u32 v9, v19, v14
+; GISEL-NEXT: v_xor_b32_e32 v17, v1, v4
+; GISEL-NEXT: v_mul_hi_u32 v1, v19, v8
+; GISEL-NEXT: v_mul_hi_u32 v8, v16, v8
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0
-; GISEL-NEXT: v_mul_hi_u32 v14, v13, v11
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14
-; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11
+; GISEL-NEXT: v_mul_lo_u32 v1, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT: v_mul_hi_u32 v9, v19, v14
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT: v_mul_hi_u32 v9, v16, v14
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v0
-; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v16, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v15, v10
-; GISEL-NEXT: v_mul_lo_u32 v14, v12, v11
-; GISEL-NEXT: v_lshl_b64 v[0:1], v[8:9], v6
-; GISEL-NEXT: v_mul_hi_u32 v6, v12, v10
-; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v8, v15, v11
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT: v_mul_hi_u32 v9, v12, v11
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v19, v0
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v8, v17, v0
+; GISEL-NEXT: v_mul_lo_u32 v9, v15, v1
+; GISEL-NEXT: v_mul_hi_u32 v12, v15, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0
+; GISEL-NEXT: v_lshl_b64 v[10:11], v[10:11], v6
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v8, v6
-; GISEL-NEXT: v_mul_hi_u32 v8, v15, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v8, v6
-; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v13, 0
-; GISEL-NEXT: v_xor_b32_e32 v6, v0, v8
-; GISEL-NEXT: v_xor_b32_e32 v8, v1, v8
-; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v6
-; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v8
-; GISEL-NEXT: v_mov_b32_e32 v0, v10
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, v[0:1]
-; GISEL-NEXT: v_mac_f32_e32 v14, 0x4f800000, v16
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v14
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v13, v[0:1]
-; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6
-; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v10
-; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1
-; GISEL-NEXT: v_trunc_f32_e32 v13, v10
-; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v13
-; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1
-; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13
-; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v8, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v14, 0
-; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v12, v9
-; GISEL-NEXT: v_mov_b32_e32 v1, v11
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v13, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10
-; GISEL-NEXT: v_subb_u32_e64 v18, s[4:5], v15, v0, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v14, v[11:12]
-; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v15, v0
-; GISEL-NEXT: v_mul_lo_u32 v12, v14, v11
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v18, v7
-; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc
-; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12
-; GISEL-NEXT: v_mul_hi_u32 v12, v14, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[6:7]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[6:7]
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v9, v5
-; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v0, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v7
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v12, v17, v1
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT: v_mul_hi_u32 v9, v15, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v0, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v0
+; GISEL-NEXT: v_mul_hi_u32 v9, v17, v1
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v9, v8
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v12, v[1:2]
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v7, v14, v[8:9]
+; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v15, v0
+; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v17, v12, vcc
+; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v12
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v5
; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9]
-; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v7
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7]
-; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_hi_u32 v1, v13, v10
-; GISEL-NEXT: v_mul_lo_u32 v10, v13, v11
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0
-; GISEL-NEXT: v_mul_hi_u32 v15, v14, v11
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15
-; GISEL-NEXT: v_mul_hi_u32 v11, v13, v11
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v7
+; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v13, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v6, s[4:5]
+; GISEL-NEXT: v_subbrev_u32_e64 v16, s[4:5], 0, v0, vcc
+; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v11
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v10, v1
+; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v11, v1, s[4:5]
+; GISEL-NEXT: v_xor_b32_e32 v6, v6, v1
+; GISEL-NEXT: v_xor_b32_e32 v8, v8, v1
+; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v6
+; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v5
+; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v9
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v16, v7
+; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v0, v7, vcc
+; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1
+; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
+; GISEL-NEXT: v_trunc_f32_e32 v1, v1
+; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
+; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0
+; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v11, v10, v11, s[4:5]
+; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v17, 0
+; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v8, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v18, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v21, v18, v0
+; GISEL-NEXT: v_mul_hi_u32 v22, v17, v0
+; GISEL-NEXT: v_mul_hi_u32 v23, v18, v0
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v15, v5
+; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v7, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v15, v0, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v17, v[9:10]
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc
+; GISEL-NEXT: v_mul_lo_u32 v1, v17, v0
+; GISEL-NEXT: v_mul_lo_u32 v10, v18, v0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v21, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v22
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v0
-; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v11, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[1:2]
-; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4
-; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v11, v[9:10]
-; GISEL-NEXT: v_cndmask_b32_e32 v7, v18, v7, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
-; GISEL-NEXT: v_xor_b32_e32 v12, v2, v5
-; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0
-; GISEL-NEXT: v_mul_lo_u32 v10, v11, v9
-; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5
-; GISEL-NEXT: v_mul_hi_u32 v3, v11, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1
+; GISEL-NEXT: v_mul_hi_u32 v9, v17, v0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v23
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v1
+; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v18, v0, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v15, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v13, v7, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v16, v[1:2]
+; GISEL-NEXT: v_xor_b32_e32 v1, v7, v4
+; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v20, v15, v[9:10]
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc
+; GISEL-NEXT: v_xor_b32_e32 v12, v2, v7
+; GISEL-NEXT: v_mul_lo_u32 v2, v16, v0
+; GISEL-NEXT: v_mul_lo_u32 v9, v15, v11
+; GISEL-NEXT: v_xor_b32_e32 v13, v3, v7
+; GISEL-NEXT: v_mul_hi_u32 v3, v15, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v13, v9
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2
-; GISEL-NEXT: v_mul_hi_u32 v10, v11, v9
+; GISEL-NEXT: v_mul_lo_u32 v3, v16, v11
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2
+; GISEL-NEXT: v_mul_hi_u32 v9, v15, v11
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10
-; GISEL-NEXT: v_mul_hi_u32 v9, v13, v9
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
+; GISEL-NEXT: v_mul_hi_u32 v9, v16, v11
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v16, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0
; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2
; GISEL-NEXT: v_mul_hi_u32 v10, v12, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
-; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4
+; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
+; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v10, v14, v2
+; GISEL-NEXT: v_mul_lo_u32 v10, v13, v2
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3
; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
@@ -2577,26 +2553,25 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3
-; GISEL-NEXT: v_mul_hi_u32 v10, v14, v2
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v0
-; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v9, v[0:1]
+; GISEL-NEXT: v_mul_hi_u32 v9, v13, v2
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v0, v[3:4]
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v4, vcc
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc
; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v11, v[9:10]
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc
-; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3
+; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc
+; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v8
; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5]
; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v2, v6
; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v8
@@ -2611,13 +2586,13 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
; GISEL-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v5
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v5
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, v2, v7
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_srem_v2i64_pow2_shl_denom:
@@ -2645,103 +2620,100 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_xor_b32_e32 v1, v4, v1
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0
; CGP-NEXT: v_cvt_f32_u32_e32 v10, v1
-; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v0
-; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc
+; CGP-NEXT: v_sub_i32_e32 v17, vcc, 0, v0
+; CGP-NEXT: v_subb_u32_e32 v18, vcc, 0, v1, vcc
; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v10
; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4
-; CGP-NEXT: v_trunc_f32_e32 v12, v10
-; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12
-; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v16, v12
-; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
-; CGP-NEXT: v_mov_b32_e32 v4, v11
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5]
-; CGP-NEXT: v_mul_lo_u32 v4, v16, v10
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
-; CGP-NEXT: v_mul_hi_u32 v12, v13, v10
+; CGP-NEXT: v_trunc_f32_e32 v10, v10
+; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v10
+; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v16, v10
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v4, 0
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v16, v[11:12]
+; CGP-NEXT: v_mul_lo_u32 v11, v16, v10
+; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v4, v[12:13]
+; CGP-NEXT: v_mul_hi_u32 v12, v4, v10
; CGP-NEXT: v_mul_hi_u32 v10, v16, v10
-; CGP-NEXT: v_mul_lo_u32 v17, v13, v11
-; CGP-NEXT: v_mul_lo_u32 v18, v16, v11
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v17
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12
-; CGP-NEXT: v_mul_hi_u32 v12, v13, v11
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v17, v4
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v18, v10
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v13, v4, v14
+; CGP-NEXT: v_mul_lo_u32 v15, v16, v14
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT: v_mul_hi_u32 v12, v4, v14
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12
-; CGP-NEXT: v_mul_hi_u32 v11, v16, v11
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4
-; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v10, vcc
-; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
-; CGP-NEXT: v_mov_b32_e32 v4, v11
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5]
-; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v14
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v14, vcc
-; CGP-NEXT: v_xor_b32_e32 v12, v4, v14
-; CGP-NEXT: v_mul_lo_u32 v4, v16, v10
-; CGP-NEXT: v_mul_lo_u32 v9, v13, v11
-; CGP-NEXT: v_xor_b32_e32 v15, v8, v14
-; CGP-NEXT: v_mul_hi_u32 v8, v13, v10
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT: v_mul_hi_u32 v13, v16, v14
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
+; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v11, vcc
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v4, 0
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v16, v[11:12]
+; CGP-NEXT: v_ashrrev_i32_e32 v17, 31, v9
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v17
+; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v4, v[12:13]
+; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v17, vcc
+; CGP-NEXT: v_xor_b32_e32 v15, v8, v17
+; CGP-NEXT: v_mul_lo_u32 v8, v16, v10
+; CGP-NEXT: v_mul_lo_u32 v11, v4, v14
+; CGP-NEXT: v_xor_b32_e32 v18, v9, v17
+; CGP-NEXT: v_mul_hi_u32 v9, v4, v10
; CGP-NEXT: v_mul_hi_u32 v10, v16, v10
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v8, v16, v11
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4
-; CGP-NEXT: v_mul_hi_u32 v9, v13, v11
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT: v_mul_hi_u32 v10, v16, v11
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v9, v16, v14
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8
+; CGP-NEXT: v_mul_hi_u32 v11, v4, v14
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT: v_mul_hi_u32 v11, v16, v14
; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v16, v8, vcc
-; CGP-NEXT: v_mul_lo_u32 v9, v15, v4
-; CGP-NEXT: v_mul_lo_u32 v10, v12, v8
-; CGP-NEXT: v_mul_hi_u32 v11, v12, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v15, v4
-; CGP-NEXT: v_mul_hi_u32 v13, v15, v8
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; CGP-NEXT: v_addc_u32_e32 v8, vcc, v16, v9, vcc
+; CGP-NEXT: v_mul_lo_u32 v9, v18, v4
+; CGP-NEXT: v_mul_lo_u32 v10, v15, v8
+; CGP-NEXT: v_mul_hi_u32 v11, v15, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v18, v4
; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v11, v15, v8
+; CGP-NEXT: v_mul_lo_u32 v11, v18, v8
; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT: v_mul_hi_u32 v10, v12, v8
+; CGP-NEXT: v_mul_hi_u32 v10, v15, v8
; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v9
-; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v11, 0
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v4
-; CGP-NEXT: v_mov_b32_e32 v4, v9
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v10, v[4:5]
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v12, v8
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v11, v[9:10]
-; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v15, v9, vcc
-; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v15, v9
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v11, v18, v8
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v9
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v4, 0
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v11, v10
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v0, v12, v[9:10]
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v1, v4, v[10:11]
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v15, v8
+; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v18, v12, vcc
+; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v18, v12
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v0
@@ -2754,11 +2726,11 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v1
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v0
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v1
; CGP-NEXT: v_subb_u32_e32 v1, vcc, v9, v1, vcc
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v0
-; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v15, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5]
; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
; CGP-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc
@@ -2766,10 +2738,10 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v14
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v14
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v14
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v14, vcc
+; CGP-NEXT: v_xor_b32_e32 v0, v0, v17
+; CGP-NEXT: v_xor_b32_e32 v1, v1, v17
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v17
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v17, vcc
; CGP-NEXT: ; implicit-def: $vgpr11_vgpr12
; CGP-NEXT: ; implicit-def: $vgpr8
; CGP-NEXT: .LBB8_2: ; %Flow1
@@ -2819,117 +2791,115 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_xor_b32_e32 v3, v4, v3
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
-; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v2
-; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc
+; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v2
+; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc
; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6
; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4
; CGP-NEXT: v_trunc_f32_e32 v6, v6
; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6
-; CGP-NEXT: v_cvt_u32_f32_e32 v11, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0
-; CGP-NEXT: v_mov_b32_e32 v4, v9
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5]
-; CGP-NEXT: v_mul_lo_u32 v4, v6, v8
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10]
-; CGP-NEXT: v_mul_hi_u32 v10, v11, v8
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v4, 0
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v6, v[9:10]
+; CGP-NEXT: v_mul_lo_u32 v9, v6, v8
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[10:11]
+; CGP-NEXT: v_mul_hi_u32 v10, v4, v8
; CGP-NEXT: v_mul_hi_u32 v8, v6, v8
-; CGP-NEXT: v_mul_lo_u32 v14, v11, v9
-; CGP-NEXT: v_mul_lo_u32 v15, v6, v9
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v11, v9
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v11, v4, v12
+; CGP-NEXT: v_mul_lo_u32 v13, v6, v12
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; CGP-NEXT: v_mul_hi_u32 v10, v4, v12
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10
-; CGP-NEXT: v_mul_hi_u32 v9, v6, v9
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v4
-; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc
-; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0
-; CGP-NEXT: v_mov_b32_e32 v4, v9
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5]
-; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v12
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10]
-; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v12, vcc
-; CGP-NEXT: v_xor_b32_e32 v7, v4, v12
-; CGP-NEXT: v_mul_lo_u32 v4, v6, v8
-; CGP-NEXT: v_mul_lo_u32 v10, v11, v9
-; CGP-NEXT: v_xor_b32_e32 v13, v5, v12
-; CGP-NEXT: v_mul_hi_u32 v5, v11, v8
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT: v_mul_hi_u32 v11, v6, v12
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v9, vcc
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v4, 0
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v6, v[9:10]
+; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v7
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[10:11]
+; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v14, vcc
+; CGP-NEXT: v_xor_b32_e32 v10, v5, v14
+; CGP-NEXT: v_mul_lo_u32 v5, v6, v8
+; CGP-NEXT: v_mul_lo_u32 v9, v4, v12
+; CGP-NEXT: v_xor_b32_e32 v11, v7, v14
+; CGP-NEXT: v_mul_hi_u32 v7, v4, v8
; CGP-NEXT: v_mul_hi_u32 v8, v6, v8
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v5, v6, v9
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
-; CGP-NEXT: v_mul_hi_u32 v10, v11, v9
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; CGP-NEXT: v_mul_hi_u32 v9, v6, v9
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; CGP-NEXT: v_mul_lo_u32 v7, v6, v12
; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4
-; CGP-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc
-; CGP-NEXT: v_mul_lo_u32 v6, v13, v4
-; CGP-NEXT: v_mul_lo_u32 v8, v7, v5
-; CGP-NEXT: v_mul_hi_u32 v9, v7, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v13, v4
-; CGP-NEXT: v_mul_hi_u32 v10, v13, v5
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; CGP-NEXT: v_mul_hi_u32 v9, v4, v12
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9
-; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v9, v13, v5
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; CGP-NEXT: v_mul_hi_u32 v8, v7, v5
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v4, v6
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT: v_mul_hi_u32 v9, v6, v12
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CGP-NEXT: v_addc_u32_e32 v5, vcc, v6, v7, vcc
+; CGP-NEXT: v_mul_lo_u32 v6, v11, v4
+; CGP-NEXT: v_mul_lo_u32 v7, v10, v5
+; CGP-NEXT: v_mul_hi_u32 v8, v10, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v11, v4
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6]
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v7, v4
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[5:6]
-; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v13, v5, vcc
-; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
+; CGP-NEXT: v_mul_lo_u32 v8, v11, v5
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CGP-NEXT: v_mul_hi_u32 v7, v10, v5
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v4, v6
+; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v4
+; CGP-NEXT: v_mul_hi_u32 v7, v11, v5
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v12, 0
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v7, v6
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[5:6]
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v10, v4
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v12, v[6:7]
+; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v11, v8, vcc
+; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v11, v8
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
-; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v3
+; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc
; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5]
; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2
-; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc
+; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2
; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
@@ -2938,11 +2908,11 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v12
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v12
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v14
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v14
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v14
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc
; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10
; CGP-NEXT: ; implicit-def: $vgpr5
; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
@@ -3004,15 +2974,15 @@ define i64 @v_srem_i64_24bit(i64 %num, i64 %den) {
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3
-; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0
; CGP-NEXT: v_rcp_f32_e32 v1, v1
; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v1
; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3
-; CGP-NEXT: v_mul_lo_u32 v1, v1, v4
-; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v1, 0
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v2
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0
+; CGP-NEXT: v_mul_lo_u32 v5, v1, v4
+; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v5, 0
+; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
; CGP-NEXT: v_mul_lo_u32 v0, v1, v3
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v5, v0
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v3
@@ -3035,196 +3005,192 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4
; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1
-; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v7, 0
-; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v1
-; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v7
+; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0
+; GISEL-NEXT: v_sub_i32_e32 v12, vcc, 0, v1
+; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9
; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc
-; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2
+; GISEL-NEXT: v_subb_u32_e64 v13, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3
-; GISEL-NEXT: v_trunc_f32_e32 v5, v4
-; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5
-; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v3
-; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5]
-; GISEL-NEXT: v_mul_hi_u32 v12, v8, v3
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3
-; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3
-; GISEL-NEXT: v_mul_lo_u32 v13, v8, v4
-; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4
+; GISEL-NEXT: v_trunc_f32_e32 v4, v4
+; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v3
+; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v4
+; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v10, v[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, v[7:8]
+; GISEL-NEXT: v_mul_lo_u32 v5, v10, v3
+; GISEL-NEXT: v_mul_hi_u32 v7, v11, v3
+; GISEL-NEXT: v_mul_lo_u32 v8, v11, v4
+; GISEL-NEXT: v_mul_hi_u32 v3, v10, v3
+; GISEL-NEXT: v_mul_lo_u32 v14, v10, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v11, v4
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v3
-; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5]
-; GISEL-NEXT: v_mul_hi_u32 v9, v8, v3
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5]
-; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0
-; GISEL-NEXT: v_mul_lo_u32 v0, v11, v3
-; GISEL-NEXT: v_mul_lo_u32 v5, v8, v4
-; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v3
+; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v10, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v10, v[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, v[7:8]
+; GISEL-NEXT: v_mul_lo_u32 v5, v10, v3
+; GISEL-NEXT: v_mul_hi_u32 v8, v11, v3
+; GISEL-NEXT: v_mul_lo_u32 v7, v11, v4
+; GISEL-NEXT: v_mul_hi_u32 v3, v10, v3
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v9, v11, v4
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT: v_mul_hi_u32 v5, v8, v4
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v8, v10, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v7, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
-; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v3, vcc
-; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0
-; GISEL-NEXT: v_mul_lo_u32 v5, v10, v8
-; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6
-; GISEL-NEXT: v_mul_hi_u32 v6, v10, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_mul_lo_u32 v5, 0, v8
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; GISEL-NEXT: v_mul_hi_u32 v6, v10, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v4
-; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v9, 0
-; GISEL-NEXT: v_mul_hi_u32 v6, 0, v8
-; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v7
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v0
-; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v6, v[0:1]
-; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v7
-; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v0
-; GISEL-NEXT: v_trunc_f32_e32 v11, v7
-; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v11
-; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v0
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v3
+; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v10, v4, vcc
+; GISEL-NEXT: v_mul_lo_u32 v7, 0, v5
+; GISEL-NEXT: v_mul_lo_u32 v8, v0, v4
+; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6
+; GISEL-NEXT: v_mul_hi_u32 v6, v0, v5
+; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT: v_mul_lo_u32 v8, 0, v4
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT: v_mul_hi_u32 v7, v0, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v5, v6
+; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v3
+; GISEL-NEXT: v_mul_hi_u32 v8, 0, v4
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v10, 0
+; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v9
; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v3
-; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v12, 0
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6]
-; GISEL-NEXT: v_mov_b32_e32 v0, v8
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[5:6]
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v11
; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v12, v[8:9]
-; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v10, v4
-; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], 0, v5
-; GISEL-NEXT: v_mul_lo_u32 v4, v11, v7
-; GISEL-NEXT: v_mul_lo_u32 v5, v12, v8
-; GISEL-NEXT: v_mul_hi_u32 v9, v12, v7
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v9, v11, v8
-; GISEL-NEXT: v_mul_hi_u32 v7, v11, v7
-; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4
-; GISEL-NEXT: v_mul_hi_u32 v5, v12, v8
-; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v10, v[6:7]
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
+; GISEL-NEXT: v_trunc_f32_e32 v6, v6
+; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6
+; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v6
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v11, 0
+; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, v8, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, v[6:7]
+; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], 0, v8
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v11, v[9:10]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5
+; GISEL-NEXT: v_mul_lo_u32 v9, v11, v6
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v4, s[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v4, v11, v5
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v7, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v7, v12, v6
+; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v9, v4
+; GISEL-NEXT: v_mul_hi_u32 v9, v11, v6
; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7
-; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8
+; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9
+; GISEL-NEXT: v_mul_hi_u32 v6, v12, v6
; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5]
; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5
-; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v4
-; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v11, v5, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v7, 0
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
-; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v0, vcc
-; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v9, -1, v6, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v8, v[0:1]
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v1
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v7, v[5:6]
-; GISEL-NEXT: v_mul_lo_u32 v12, v8, v4
-; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v11, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v7, v5
-; GISEL-NEXT: v_mul_hi_u32 v14, v7, v4
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11
-; GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v14, v8, v5
-; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT: v_mul_hi_u32 v13, v7, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v8, v5, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, 0, v4
-; GISEL-NEXT: v_mul_lo_u32 v8, v2, v7
-; GISEL-NEXT: v_mul_hi_u32 v13, v2, v4
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v0, v1
-; GISEL-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v11, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT: v_mul_lo_u32 v8, 0, v7
-; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13
-; GISEL-NEXT: v_mul_hi_u32 v13, v2, v7
+; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v6, v5
+; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v4
+; GISEL-NEXT: v_addc_u32_e64 v12, s[4:5], v12, v5, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, 0
+; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, v[5:6]
+; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v0, v1
+; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v8, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v11, v[6:7]
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v13, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v6, v12, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, v11, v8
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
+; GISEL-NEXT: v_cndmask_b32_e32 v9, -1, v5, vcc
+; GISEL-NEXT: v_mul_hi_u32 v5, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v4, v5
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; GISEL-NEXT: v_mul_hi_u32 v6, 0, v7
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v6, v[0:1]
+; GISEL-NEXT: v_mul_lo_u32 v6, v12, v8
+; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v7, v11, v8
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v12, v8
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4
+; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v12, v5, vcc
+; GISEL-NEXT: v_mul_lo_u32 v6, 0, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, v2, v5
+; GISEL-NEXT: v_mul_hi_u32 v11, v2, v4
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v16, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT: v_mul_lo_u32 v7, 0, v5
+; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11
+; GISEL-NEXT: v_mul_hi_u32 v11, v2, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v4, v6
+; GISEL-NEXT: v_mul_hi_u32 v12, 0, v5
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v11, 0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v8, v[5:6]
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v7, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[5:6]
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v8, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v11, v[6:7]
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v8
+; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v8, vcc
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v3
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
@@ -3264,15 +3230,15 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0
; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7
; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v1
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v5, 0
; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4
-; CGP-NEXT: v_mul_lo_u32 v0, v0, v6
-; CGP-NEXT: v_mul_lo_u32 v5, v1, v3
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v6, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0
+; CGP-NEXT: v_mul_lo_u32 v5, v0, v6
+; CGP-NEXT: v_mul_lo_u32 v7, v1, v3
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v7
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v1
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v6, 0
; CGP-NEXT: v_sub_i32_e32 v7, vcc, v5, v3
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3
; CGP-NEXT: v_mul_lo_u32 v6, v1, v4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
index 1812e17..10e83b7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
@@ -189,15 +189,11 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
; GFX10-NEXT: v_mov_b32_e32 v2, s1
; GFX10-NEXT: s_lshr_b32 s6, s1, 16
; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: s_lshr_b32 s1, s1, 24
; GFX10-NEXT: s_lshr_b32 s8, s2, 16
-; GFX10-NEXT: s_and_b32 s9, 0xffff, s2
; GFX10-NEXT: s_lshr_b32 s5, s5, 8
; GFX10-NEXT: v_mov_b32_e32 v5, s0
; GFX10-NEXT: s_lshr_b32 s0, s7, 8
; GFX10-NEXT: v_mov_b32_e32 v6, s6
-; GFX10-NEXT: v_mov_b32_e32 v7, s1
-; GFX10-NEXT: s_lshr_b32 s1, s9, 8
; GFX10-NEXT: v_mov_b32_e32 v8, s5
; GFX10-NEXT: v_mov_b32_e32 v9, s0
; GFX10-NEXT: ds_write_b8 v1, v0
@@ -208,18 +204,22 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
; GFX10-NEXT: ds_write_b8 v1, v8 offset:1
; GFX10-NEXT: ds_write_b8 v1, v9 offset:5
; GFX10-NEXT: v_mov_b32_e32 v0, s8
-; GFX10-NEXT: v_mov_b32_e32 v3, s2
-; GFX10-NEXT: v_mov_b32_e32 v10, s1
+; GFX10-NEXT: s_lshr_b32 s1, s1, 24
+; GFX10-NEXT: s_and_b32 s9, 0xffff, s2
; GFX10-NEXT: s_lshr_b32 s0, s2, 24
-; GFX10-NEXT: ds_write_b8 v1, v7 offset:7
-; GFX10-NEXT: ds_write_b8 v1, v3 offset:8
-; GFX10-NEXT: ds_write_b8 v1, v10 offset:9
+; GFX10-NEXT: v_mov_b32_e32 v7, s1
+; GFX10-NEXT: s_lshr_b32 s1, s9, 8
+; GFX10-NEXT: v_mov_b32_e32 v3, s2
; GFX10-NEXT: ds_write_b8 v1, v0 offset:10
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: s_and_b32 s0, 0xffff, s3
-; GFX10-NEXT: s_lshr_b32 s1, s3, 16
+; GFX10-NEXT: v_mov_b32_e32 v10, s1
; GFX10-NEXT: s_lshr_b32 s0, s0, 8
+; GFX10-NEXT: s_lshr_b32 s1, s3, 16
; GFX10-NEXT: v_mov_b32_e32 v2, s3
+; GFX10-NEXT: ds_write_b8 v1, v7 offset:7
+; GFX10-NEXT: ds_write_b8 v1, v3 offset:8
+; GFX10-NEXT: ds_write_b8 v1, v10 offset:9
; GFX10-NEXT: v_mov_b32_e32 v3, s0
; GFX10-NEXT: s_lshr_b32 s0, s3, 24
; GFX10-NEXT: v_mov_b32_e32 v4, s1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll
index 8b5958d..e2fb7045 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll
@@ -50,7 +50,7 @@ define i16 @s_sub_i16(i16 inreg %a, i16 inreg %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_sub_co_i32 s0, s0, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
%c = sub i16 %a, %b
@@ -145,7 +145,7 @@ define i32 @s_sub_i32(i32 inreg %a, i32 inreg %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_sub_co_i32 s0, s0, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
%c = sub i32 %a, %b
@@ -299,7 +299,7 @@ define i64 @s_sub_i64(i64 inreg %a, i64 inreg %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: s_setpc_b64 s[30:31]
%c = sub i64 %a, %b
@@ -350,7 +350,7 @@ define i64 @v_sub_i64(i64 %a, i64 %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX12-NEXT: s_setpc_b64 s[30:31]
%c = sub i64 %a, %b
@@ -438,7 +438,7 @@ define void @s_usubo_usube(i64 inreg %a, i64 inreg %b, ptr addrspace(1) %res, pt
; GFX12-NEXT: s_sub_co_u32 s0, s0, s2
; GFX12-NEXT: s_sub_co_ci_u32 s1, s1, s3
; GFX12-NEXT: s_cselect_b32 s2, 1, 0
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-NEXT: v_mov_b32_e32 v6, s2
; GFX12-NEXT: global_store_b64 v[0:1], v[4:5], off
@@ -518,9 +518,9 @@ define void @v_usubo_usube(i64 %a, i64 %b, ptr addrspace(1) %res, ptr addrspace(
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX12-NEXT: global_store_b64 v[4:5], v[0:1], off
; GFX12-NEXT: global_store_b32 v[6:7], v2, off
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index 9e412b6..c50b491 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -129,68 +129,67 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX8-NEXT: v_trunc_f32_e32 v2, v1
-; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
-; GFX8-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
+; GFX8-NEXT: v_trunc_f32_e32 v1, v1
+; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1
+; GFX8-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v1
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0
; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1
; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0
; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0
-; GFX8-NEXT: v_mul_hi_u32 v5, s9, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
@@ -203,54 +202,55 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v3, v[1:2]
-; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s8, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2]
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0
+; GFX8-NEXT: v_mul_hi_u32 v3, s9, v1
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v3, v2
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v2, s9
-; GFX8-NEXT: v_mov_b32_e32 v5, s11
-; GFX8-NEXT: v_subb_u32_e64 v7, s[0:1], v2, v1, vcc
-; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v1
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v7
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v6
+; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s8, v0
+; GFX8-NEXT: v_subb_u32_e64 v5, s[0:1], v2, v4, vcc
+; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v4
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v5
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v7
-; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s10, v6
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v5
+; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s10, v3
; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v0, vcc
-; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4
-; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
+; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v6
+; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v7, s[0:1]
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v8
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2
-; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v4
+; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v8
-; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s10, v2
+; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, s10, v4
; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1]
; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9
-; GFX8-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v0, vcc
+; GFX8-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v0, vcc
; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1]
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
+; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v14, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v15, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v9, v10, v13, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v13, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v9, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v14, vcc
; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
@@ -268,66 +268,67 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX9-NEXT: v_trunc_f32_e32 v2, v1
-; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
-; GFX9-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
-; GFX9-NEXT: v_add_u32_e32 v2, v6, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
+; GFX9-NEXT: v_trunc_f32_e32 v1, v1
+; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1
+; GFX9-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v1
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX9-NEXT: v_mul_hi_u32 v4, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
+; GFX9-NEXT: v_add_u32_e32 v1, v3, v1
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v6, v5
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v4
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX9-NEXT: v_mov_b32_e32 v8, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc
; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0
; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1
; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0
; GFX9-NEXT: v_mul_hi_u32 v0, s17, v0
-; GFX9-NEXT: v_mul_hi_u32 v6, s17, v1
+; GFX9-NEXT: v_mul_hi_u32 v5, s17, v1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
@@ -339,53 +340,52 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v5, 0
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v6, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v3, v[1:2]
-; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s16, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v5, v[1:2]
+; GFX9-NEXT: v_add3_u32 v7, v3, v2, v5
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s18, v7, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v1, s19
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s19, v6, v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, s17
-; GFX9-NEXT: v_mov_b32_e32 v4, s19
-; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v2, v1, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v8
-; GFX9-NEXT: v_sub_u32_e32 v0, s17, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v7
+; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s16, v0
+; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v2, v4, vcc
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v5
+; GFX9-NEXT: v_sub_u32_e32 v0, s17, v4
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v8
-; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s18, v7
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v5
+; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s18, v3
; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v0, vcc
-; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5
-; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v6
+; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v7, s[0:1]
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v9
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v4
+; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v9
-; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s18, v2
+; GFX9-NEXT: v_subrev_co_u32_e32 v15, vcc, s18, v4
; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1]
; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10
-; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc
+; GFX9-NEXT: v_subbrev_co_u32_e32 v16, vcc, 0, v0, vcc
; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1]
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v13, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v14, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
-; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v10, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v15, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[0:1]
-; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[12:13]
-; GFX9-NEXT: global_store_dwordx2 v6, v[2:3], s[14:15]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v14, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v15, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v16, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v8, v[0:1], s[12:13]
+; GFX9-NEXT: global_store_dwordx2 v8, v[2:3], s[14:15]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: udivrem_i64:
@@ -468,31 +468,31 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
; GFX10-NEXT: v_add_co_u32 v0, s0, v5, v0
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0
+; GFX10-NEXT: v_mul_hi_u32 v5, s17, v1
; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2
; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v3
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
-; GFX10-NEXT: v_add_co_u32 v5, s0, v0, v2
-; GFX10-NEXT: v_mul_hi_u32 v2, s17, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s18, v5, 0
-; GFX10-NEXT: v_add3_u32 v3, v3, v6, v2
+; GFX10-NEXT: v_add_co_u32 v6, s0, v0, v2
+; GFX10-NEXT: v_add_nc_u32_e32 v2, v4, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s18, v6, 0
+; GFX10-NEXT: v_add3_u32 v3, v2, v3, v5
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s18, v3, v[1:2]
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s19, v5, v[1:2]
-; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v5, 1
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s19, v6, v[1:2]
+; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v6, 1
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v7, vcc_lo, s16, v0
-; GFX10-NEXT: v_sub_nc_u32_e32 v6, s17, v1
+; GFX10-NEXT: v_sub_nc_u32_e32 v5, s17, v1
; GFX10-NEXT: v_sub_co_ci_u32_e64 v8, s0, s17, v1, vcc_lo
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s19, v6, vcc_lo
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s19, v5, vcc_lo
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s18, v7
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v7, s18
+; GFX10-NEXT: v_sub_co_u32 v5, vcc_lo, v7, s18
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v0, vcc_lo
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s19, v8
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s19, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s18, v6
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s18, v5
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s19, v9
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0
@@ -503,18 +503,18 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s19, v8
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11
; GFX10-NEXT: v_cndmask_b32_e64 v1, v10, v1, s0
-; GFX10-NEXT: v_sub_co_u32 v10, s0, v6, s18
+; GFX10-NEXT: v_sub_co_u32 v10, s0, v5, s18
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v0, s0, 0, v0, s0
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v13, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v14, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v10, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v0, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v9, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v2, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v2, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v1, s0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v4, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v8, v6, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v8, v5, s0
; GFX10-NEXT: global_store_dwordx2 v9, v[0:1], s[12:13]
; GFX10-NEXT: global_store_dwordx2 v9, v[2:3], s[14:15]
; GFX10-NEXT: s_endpgm
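
The tail of each udivrem_i64 block above is the usual two-round quotient fix-up: the refined reciprocal can leave the quotient estimate short by at most two, so the checks compare the running remainder against the divisor twice and v_cndmask between q, q+1, q+2 and the matching remainders. A minimal C sketch of that correction, with hypothetical names (q is assumed to hold the pre-fix-up estimate):

#include <stdint.h>

/* Mirrors the v_sub_co/v_subb_co, v_cmp_le/v_cmp_eq and v_cndmask ladder
   that closes the GFX9/GFX10 bodies: at most two conditional corrections. */
static void udivrem64_fixup(uint64_t n, uint64_t d, uint64_t *q, uint64_t *r) {
    uint64_t rem = n - *q * d;          /* v_sub_co / v_subb_co pair        */
    if (rem >= d) { ++*q; rem -= d; }   /* first v_subrev + cndmask round   */
    if (rem >= d) { ++*q; rem -= d; }   /* second v_subrev + cndmask round  */
    *r = rem;                           /* stored alongside q via dwordx2   */
}
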
@@ -1005,72 +1005,70 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX8-NEXT: v_mov_b32_e32 v9, s13
; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX8-NEXT: v_trunc_f32_e32 v2, v1
-; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
-; GFX8-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
+; GFX8-NEXT: v_trunc_f32_e32 v1, v1
+; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1
+; GFX8-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v1
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX8-NEXT: s_sub_u32 s2, 0, s14
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1
-; GFX8-NEXT: s_subb_u32 s3, 0, s15
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0
; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1
; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0
; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0
-; GFX8-NEXT: v_mov_b32_e32 v5, s13
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
@@ -1083,138 +1081,138 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v0, v2
-; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v7, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v2
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v8, v[1:2]
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s13, v7, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v3, s9
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v0, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0
+; GFX8-NEXT: v_mul_hi_u32 v3, s9, v1
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v8, 0
+; GFX8-NEXT: v_add_u32_e64 v17, s[2:3], 1, v8
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v3, v2
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v10, v[1:2]
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s8, v0
-; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v3, v2, vcc
-; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s9, v2
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s13, v8, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v2, s9
+; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s14
+; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v2, v4, vcc
+; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s9, v4
+; GFX8-NEXT: v_cvt_f32_u32_e32 v4, s15
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v0
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
+; GFX8-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4
+; GFX8-NEXT: v_add_f32_e32 v4, v4, v5
+; GFX8-NEXT: v_rcp_iflag_f32_e32 v4, v4
+; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v9, v3, v4, s[0:1]
-; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s15
-; GFX8-NEXT: v_cvt_f32_u32_e32 v4, s14
-; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v2, v5, vcc
-; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3
-; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v1
-; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v6, vcc
-; GFX8-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; GFX8-NEXT: v_subb_u32_e32 v12, vcc, v2, v9, vcc
+; GFX8-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v4
+; GFX8-NEXT: v_cndmask_b32_e64 v11, v3, v6, s[0:1]
; GFX8-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
-; GFX8-NEXT: v_trunc_f32_e32 v4, v3
-; GFX8-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4
-; GFX8-NEXT: v_add_f32_e32 v2, v3, v2
-; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v2
-; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v7
-; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v8, s[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v4
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v11
-; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1]
-; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v5, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[3:4]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10
-; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v11
-; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1]
-; GFX8-NEXT: v_mul_lo_u32 v4, v15, v2
-; GFX8-NEXT: v_mul_lo_u32 v17, v12, v3
-; GFX8-NEXT: v_mul_hi_u32 v6, v12, v2
-; GFX8-NEXT: v_mul_hi_u32 v2, v15, v2
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v17
-; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v6, v15, v3
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v17, v4
-; GFX8-NEXT: v_mul_hi_u32 v17, v12, v3
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v17
-; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v17
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v13
-; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v14, vcc
-; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v10
-; GFX8-NEXT: v_mul_hi_u32 v3, v15, v3
-; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v5, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
+; GFX8-NEXT: v_trunc_f32_e32 v3, v3
+; GFX8-NEXT: v_mul_f32_e32 v4, 0xcf800000, v3
+; GFX8-NEXT: v_add_f32_e32 v2, v4, v2
+; GFX8-NEXT: v_cvt_u32_f32_e32 v13, v2
+; GFX8-NEXT: s_sub_u32 s8, 0, s14
+; GFX8-NEXT: v_cvt_u32_f32_e32 v14, v3
+; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s12, v1
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v13, 0
+; GFX8-NEXT: v_subbrev_u32_e64 v16, s[0:1], 0, v12, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v14, v[3:4]
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v16
+; GFX8-NEXT: s_subb_u32 s9, 0, s15
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s9, v13, v[4:5]
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v15
+; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
+; GFX8-NEXT: v_mul_lo_u32 v5, v14, v2
+; GFX8-NEXT: v_mul_lo_u32 v7, v13, v6
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v16
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1]
+; GFX8-NEXT: v_mul_hi_u32 v4, v13, v2
+; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v7
+; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1]
+; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v5, v4
+; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v12, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1]
+; GFX8-NEXT: v_mul_hi_u32 v2, v14, v2
+; GFX8-NEXT: v_mul_lo_u32 v9, v14, v6
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5
+; GFX8-NEXT: v_mul_hi_u32 v7, v13, v6
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7
+; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7
+; GFX8-NEXT: v_addc_u32_e64 v18, s[2:3], 0, v10, s[2:3]
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 1, v17
+; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v18, vcc
+; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v15
+; GFX8-NEXT: v_mul_hi_u32 v6, v14, v6
+; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v4, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4
-; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v2
-; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v12, 0
-; GFX8-NEXT: v_addc_u32_e32 v15, vcc, v15, v3, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, v5
-; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v15, v[2:3]
-; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v18, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9
-; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s3, v12, v[5:6]
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v3, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v8, v13, s[0:1]
-; GFX8-NEXT: v_mul_lo_u32 v7, v15, v4
-; GFX8-NEXT: v_mul_lo_u32 v8, v12, v5
-; GFX8-NEXT: v_mul_hi_u32 v9, v12, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v19, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v10, v11, v20, vcc
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
-; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v2
+; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v14, v4, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v13, 0
+; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v17, v9, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s8, v14, v[5:6]
+; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1]
+; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s9, v13, v[6:7]
+; GFX8-NEXT: v_mul_lo_u32 v6, v14, v4
+; GFX8-NEXT: v_mul_hi_u32 v9, v13, v4
+; GFX8-NEXT: v_mul_lo_u32 v7, v13, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v18, v12, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v15, v19, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v16, v20, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v9, v15, v5
-; GFX8-NEXT: v_mul_hi_u32 v4, v15, v4
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7
-; GFX8-NEXT: v_mul_hi_u32 v8, v12, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v9
+; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v9, v14, v8
+; GFX8-NEXT: v_mul_hi_u32 v4, v14, v4
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6
+; GFX8-NEXT: v_mul_hi_u32 v7, v13, v8
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v9, v4
; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8
-; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v9, v8
-; GFX8-NEXT: v_mul_hi_u32 v5, v15, v5
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v7
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v12, v4
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v15, v5, vcc
-; GFX8-NEXT: v_mul_lo_u32 v7, s11, v4
-; GFX8-NEXT: v_mul_lo_u32 v8, s10, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v6, s[0:1]
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7
+; GFX8-NEXT: v_mul_hi_u32 v8, v14, v8
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v13, v4
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v14, v6, vcc
+; GFX8-NEXT: v_mul_lo_u32 v8, s11, v4
+; GFX8-NEXT: v_mul_lo_u32 v9, s10, v7
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v5, s[0:1]
; GFX8-NEXT: v_mul_hi_u32 v1, s10, v4
; GFX8-NEXT: v_mul_hi_u32 v4, s11, v4
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v8, v9
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v7, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v7, s11, v5
+; GFX8-NEXT: v_mul_lo_u32 v5, s11, v7
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v8, v1
-; GFX8-NEXT: v_mul_hi_u32 v8, s10, v5
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX8-NEXT: v_mul_hi_u32 v8, s10, v7
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4
+; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v8
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v4, v1
-; GFX8-NEXT: v_mul_hi_u32 v8, s11, v5
-; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s14, v11, 0
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v7, v1
-; GFX8-NEXT: v_add_u32_e32 v12, vcc, v8, v1
-; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s14, v12, v[1:2]
+; GFX8-NEXT: v_mul_hi_u32 v7, s11, v7
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v1
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s14, v11, 0
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, v7, v1
+; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s14, v12, v[5:6]
; GFX8-NEXT: v_cndmask_b32_e64 v7, v0, v10, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v5, s15
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s15, v11, v[8:9]
@@ -1274,65 +1272,66 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT: v_mov_b32_e32 v9, s5
+; GFX9-NEXT: s_sub_u32 s8, 0, s6
+; GFX9-NEXT: s_subb_u32 s9, 0, s7
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX9-NEXT: v_trunc_f32_e32 v2, v1
-; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
-; GFX9-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
-; GFX9-NEXT: v_add_u32_e32 v2, v6, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX9-NEXT: s_sub_u32 s2, 0, s6
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1
-; GFX9-NEXT: s_subb_u32 s3, 0, s7
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
+; GFX9-NEXT: v_trunc_f32_e32 v1, v1
+; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1
+; GFX9-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v1
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX9-NEXT: v_mul_hi_u32 v4, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
+; GFX9-NEXT: v_add_u32_e32 v1, v3, v1
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v6, v5
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v4
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc
; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0
; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1
; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0
@@ -1349,135 +1348,132 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v7, 0
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v8, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s4, v8, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v7, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v3, s17
+; GFX9-NEXT: v_add3_u32 v10, v3, v2, v5
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s4, v10, v[1:2]
; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s16, v0
-; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v3, v2, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v8, v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, s17
+; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s6
+; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v2, v4, vcc
+; GFX9-NEXT: v_sub_u32_e32 v2, s17, v4
+; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s7
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
+; GFX9-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4
+; GFX9-NEXT: v_add_f32_e32 v4, v4, v5
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v4
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v9, v3, v4, s[0:1]
-; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7
-; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6
-; GFX9-NEXT: v_sub_u32_e32 v2, s17, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v5, vcc
-; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3
-; GFX9-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s4, v1
-; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v6, vcc
-; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v9, vcc
+; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v4
+; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v6, s[0:1]
; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
-; GFX9-NEXT: v_trunc_f32_e32 v4, v3
-; GFX9-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4
-; GFX9-NEXT: v_add_f32_e32 v2, v3, v2
-; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v2
-; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v7
-; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v8, s[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v4
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v11
-; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1]
-; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v5, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[3:4]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v10
-; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v11
-; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1]
-; GFX9-NEXT: v_mul_lo_u32 v4, v15, v2
-; GFX9-NEXT: v_mul_lo_u32 v17, v12, v3
-; GFX9-NEXT: v_mul_hi_u32 v5, v12, v2
-; GFX9-NEXT: v_mul_hi_u32 v2, v15, v2
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v17
-; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v5, v15, v3
-; GFX9-NEXT: v_add_u32_e32 v4, v17, v4
-; GFX9-NEXT: v_mul_hi_u32 v17, v12, v3
-; GFX9-NEXT: v_mul_hi_u32 v3, v15, v3
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v17
-; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GFX9-NEXT: v_add_u32_e32 v5, v5, v17
-; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13
-; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v14, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v2
-; GFX9-NEXT: v_add3_u32 v3, v5, v4, v3
-; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v12, 0
-; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v3, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, v5
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, v[2:3]
-; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v10
-; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v6, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s3, v12, v[2:3]
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v13, s[0:1]
-; GFX9-NEXT: v_mul_lo_u32 v6, v15, v4
-; GFX9-NEXT: v_mul_lo_u32 v7, v12, v5
-; GFX9-NEXT: v_mul_hi_u32 v9, v12, v4
-; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v18, vcc
-; GFX9-NEXT: v_add_co_u32_e64 v6, s[2:3], v6, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[2:3]
-; GFX9-NEXT: v_add_co_u32_e64 v6, s[2:3], v6, v9
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3]
-; GFX9-NEXT: v_mul_lo_u32 v9, v15, v5
-; GFX9-NEXT: v_add_u32_e32 v6, v7, v6
-; GFX9-NEXT: v_mul_hi_u32 v7, v12, v5
-; GFX9-NEXT: v_mul_hi_u32 v5, v15, v5
-; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v9, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[2:3]
-; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[2:3]
-; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v6
+; GFX9-NEXT: v_trunc_f32_e32 v3, v3
+; GFX9-NEXT: v_mul_f32_e32 v4, 0xcf800000, v3
+; GFX9-NEXT: v_add_f32_e32 v2, v4, v2
+; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v2
+; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v3
+; GFX9-NEXT: v_subrev_co_u32_e32 v15, vcc, s4, v1
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v13, 0
+; GFX9-NEXT: v_subbrev_co_u32_e64 v16, s[0:1], 0, v12, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v14, v[3:4]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v16
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s9, v13, v[4:5]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v15
+; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
+; GFX9-NEXT: v_mul_lo_u32 v5, v14, v2
+; GFX9-NEXT: v_mul_lo_u32 v7, v13, v6
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v16
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1]
+; GFX9-NEXT: v_mul_hi_u32 v4, v13, v2
+; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v7
+; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], v5, v4
+; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v12, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1]
+; GFX9-NEXT: v_mul_hi_u32 v2, v14, v2
+; GFX9-NEXT: v_mul_lo_u32 v9, v14, v6
+; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
+; GFX9-NEXT: v_mul_hi_u32 v7, v13, v6
+; GFX9-NEXT: v_add_co_u32_e64 v17, s[2:3], 1, v8
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7
+; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT: v_addc_co_u32_e64 v18, s[2:3], 0, v10, s[2:3]
; GFX9-NEXT: v_add_u32_e32 v7, v9, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3]
-; GFX9-NEXT: v_add3_u32 v5, v7, v6, v5
-; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v12, v4
-; GFX9-NEXT: v_addc_co_u32_e64 v5, s[2:3], v15, v5, s[2:3]
-; GFX9-NEXT: v_mul_lo_u32 v6, s19, v4
-; GFX9-NEXT: v_mul_lo_u32 v7, s18, v5
-; GFX9-NEXT: v_mul_hi_u32 v9, s18, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v14, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v19, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v20, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, 1, v17
+; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v18, vcc
+; GFX9-NEXT: v_mul_hi_u32 v6, v14, v6
+; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v15
+; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v4, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v4, v7, v4, v6
+; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v13, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, v14, v4, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v13, 0
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v9, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s8, v14, v[5:6]
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s9, v13, v[6:7]
+; GFX9-NEXT: v_mul_lo_u32 v6, v14, v4
+; GFX9-NEXT: v_mul_hi_u32 v9, v13, v4
+; GFX9-NEXT: v_mul_lo_u32 v7, v13, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v12, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v15, v19, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v16, v20, vcc
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7
; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v9
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v9, s19, v5
-; GFX9-NEXT: v_mul_hi_u32 v4, s19, v4
+; GFX9-NEXT: v_mul_lo_u32 v9, v14, v8
+; GFX9-NEXT: v_mul_hi_u32 v4, v14, v4
; GFX9-NEXT: v_add_u32_e32 v6, v7, v6
-; GFX9-NEXT: v_mul_hi_u32 v7, s18, v5
-; GFX9-NEXT: v_mul_hi_u32 v13, s19, v5
+; GFX9-NEXT: v_mul_hi_u32 v7, v13, v8
+; GFX9-NEXT: v_mul_hi_u32 v8, v14, v8
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v9, v4
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7
; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v4, v6
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9-NEXT: v_add_u32_e32 v7, v9, v7
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v6, v7, v6, v8
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v13, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v14, v6, vcc
+; GFX9-NEXT: v_mul_lo_u32 v8, s19, v4
+; GFX9-NEXT: v_mul_lo_u32 v9, s18, v7
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v1, v5, s[0:1]
+; GFX9-NEXT: v_mul_hi_u32 v1, s18, v4
+; GFX9-NEXT: v_mul_hi_u32 v4, s19, v4
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v8, v9
+; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v5, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v5, s19, v7
+; GFX9-NEXT: v_add_u32_e32 v1, v8, v1
+; GFX9-NEXT: v_mul_hi_u32 v8, s18, v7
+; GFX9-NEXT: v_mul_hi_u32 v7, s19, v7
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4
+; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8
+; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v4, v1
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s6, v11, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v1, v8, s[0:1]
-; GFX9-NEXT: v_add_u32_e32 v1, v9, v7
-; GFX9-NEXT: v_add3_u32 v12, v1, v12, v13
-; GFX9-NEXT: v_mov_b32_e32 v1, v5
-; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s6, v12, v[1:2]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add_u32_e32 v8, v9, v8
+; GFX9-NEXT: v_add3_u32 v12, v8, v1, v7
+; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s6, v12, v[5:6]
; GFX9-NEXT: v_cndmask_b32_e64 v7, v0, v10, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v5, s7
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s7, v11, v[8:9]
@@ -1546,14 +1542,14 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX10-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
; GFX10-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0
; GFX10-NEXT: v_mul_f32_e32 v3, 0x2f800000, v1
-; GFX10-NEXT: v_trunc_f32_e32 v4, v2
-; GFX10-NEXT: v_trunc_f32_e32 v5, v3
-; GFX10-NEXT: v_mul_f32_e32 v2, 0xcf800000, v4
-; GFX10-NEXT: v_mul_f32_e32 v3, 0xcf800000, v5
-; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v4
-; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v5
-; GFX10-NEXT: v_add_f32_e32 v0, v2, v0
-; GFX10-NEXT: v_add_f32_e32 v1, v3, v1
+; GFX10-NEXT: v_trunc_f32_e32 v2, v2
+; GFX10-NEXT: v_trunc_f32_e32 v4, v3
+; GFX10-NEXT: v_mul_f32_e32 v3, 0xcf800000, v2
+; GFX10-NEXT: v_mul_f32_e32 v5, 0xcf800000, v4
+; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v2
+; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v4
+; GFX10-NEXT: v_add_f32_e32 v0, v3, v0
+; GFX10-NEXT: v_add_f32_e32 v1, v5, v1
; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v1
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s1, v7, 0
@@ -1662,119 +1658,119 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX10-NEXT: v_mul_hi_u32 v4, s17, v4
; GFX10-NEXT: v_mul_lo_u32 v9, s17, v2
; GFX10-NEXT: v_mul_lo_u32 v6, s19, v1
-; GFX10-NEXT: v_mul_hi_u32 v10, s16, v2
-; GFX10-NEXT: v_mul_hi_u32 v11, s17, v2
-; GFX10-NEXT: v_mul_lo_u32 v2, s18, v0
+; GFX10-NEXT: v_mul_lo_u32 v11, s18, v0
; GFX10-NEXT: v_mul_hi_u32 v7, s18, v1
; GFX10-NEXT: v_mul_hi_u32 v1, s19, v1
; GFX10-NEXT: v_mul_lo_u32 v12, s19, v0
-; GFX10-NEXT: v_mul_hi_u32 v13, s18, v0
-; GFX10-NEXT: v_mul_hi_u32 v14, s19, v0
-; GFX10-NEXT: v_add_co_u32 v0, s0, v3, v8
-; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
-; GFX10-NEXT: v_add_co_u32 v4, s0, v9, v4
+; GFX10-NEXT: v_add_co_u32 v3, s0, v3, v8
+; GFX10-NEXT: v_mul_hi_u32 v10, s16, v2
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0
-; GFX10-NEXT: v_add_co_u32 v2, s0, v6, v2
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX10-NEXT: v_add_co_u32 v1, s0, v12, v1
+; GFX10-NEXT: v_add_co_u32 v4, s0, v9, v4
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0
-; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v11
+; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s0
+; GFX10-NEXT: v_add_co_u32 v1, s0, v12, v1
+; GFX10-NEXT: v_mul_hi_u32 v13, s18, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s0
+; GFX10-NEXT: v_add_co_u32 v3, s0, v3, v5
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v10
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0
-; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v7
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
-; GFX10-NEXT: v_add_nc_u32_e32 v0, v3, v0
+; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v7
+; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v13
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0
-; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2
-; GFX10-NEXT: v_add_nc_u32_e32 v5, v8, v5
-; GFX10-NEXT: v_add_co_u32 v8, s0, v4, v0
+; GFX10-NEXT: v_mul_hi_u32 v10, s19, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v0, v8, v3
+; GFX10-NEXT: v_add_nc_u32_e32 v3, v9, v5
+; GFX10-NEXT: v_add_nc_u32_e32 v5, v11, v6
+; GFX10-NEXT: v_mul_hi_u32 v2, s17, v2
+; GFX10-NEXT: v_add_nc_u32_e32 v6, v12, v7
+; GFX10-NEXT: v_add_co_u32 v7, s0, v4, v0
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0
-; GFX10-NEXT: v_add_co_u32 v10, s0, v1, v2
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s4, v8, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s6, v10, 0
-; GFX10-NEXT: v_add_nc_u32_e32 v7, v9, v7
-; GFX10-NEXT: v_add3_u32 v9, v5, v4, v11
-; GFX10-NEXT: v_add_co_u32 v12, vcc_lo, v8, 1
-; GFX10-NEXT: v_mov_b32_e32 v11, 0
-; GFX10-NEXT: v_add3_u32 v7, v7, v6, v14
+; GFX10-NEXT: v_add_co_u32 v8, s0, v1, v5
+; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s4, v7, 0
+; GFX10-NEXT: v_add3_u32 v9, v3, v4, v2
+; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s6, v8, 0
+; GFX10-NEXT: v_add3_u32 v10, v6, v5, v10
+; GFX10-NEXT: v_mov_b32_e32 v12, 0
; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s4, v9, v[1:2]
-; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v9, vcc_lo
-; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s6, v7, v[3:4]
-; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s5, v8, v[4:5]
-; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v12, 1
-; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v13, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v7, 1
+; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v9, vcc_lo
+; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s6, v10, v[3:4]
+; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v1, 1
+; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v11, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, s16, v0
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s7, v10, v[5:6]
-; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s0, s17, v3, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4, v14
-; GFX10-NEXT: v_sub_nc_u32_e32 v1, s17, v3
-; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, -1, s0
-; GFX10-NEXT: v_sub_co_u32 v15, s0, s18, v2
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo
-; GFX10-NEXT: v_sub_co_ci_u32_e64 v16, s1, s19, v0, s0
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v15
-; GFX10-NEXT: v_sub_nc_u32_e32 v0, s19, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v17, vcc_lo, v14, s4
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, s1, 0, v1, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v5
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v23, s0, s7, v0, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, v18
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo
+; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s5, v7, v[4:5]
+; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s7, v8, v[5:6]
+; GFX10-NEXT: v_sub_nc_u32_e32 v5, s17, v3
+; GFX10-NEXT: v_sub_co_ci_u32_e64 v3, s0, s17, v3, vcc_lo
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s5, v5, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v14
+; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc_lo
+; GFX10-NEXT: v_sub_co_u32 v15, vcc_lo, v14, s4
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v16, s0, 0, v0, vcc_lo
+; GFX10-NEXT: v_sub_co_u32 v17, s0, s18, v2
+; GFX10-NEXT: v_sub_co_ci_u32_e64 v18, s1, s19, v4, s0
+; GFX10-NEXT: v_cmp_le_u32_e64 s1, s4, v15
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s5, v0, vcc_lo
+; GFX10-NEXT: v_sub_nc_u32_e32 v4, s19, v4
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s1
+; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v16
; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, -1, s1
-; GFX10-NEXT: v_cmp_le_u32_e64 s1, s4, v17
+; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v3
; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, -1, s1
-; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v18
-; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, -1, s1
-; GFX10-NEXT: v_cmp_le_u32_e64 s1, s7, v16
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v21, v20, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, -1, s1
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0
-; GFX10-NEXT: v_sub_co_u32 v0, s0, v17, s4
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v19, s0, 0, v1, s0
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v6, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v6, s1, v15, s6
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v17, v0, vcc_lo
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v12, s2, 0, v23, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v1, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v3, s0
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v16
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v4, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v3, s0
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v22, v2, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v12
-; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v6
+; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s5, v16
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v19, v2, s1
+; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s5, v3
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v5, v20, v5, s1
+; GFX10-NEXT: v_sub_co_u32 v2, s1, v15, s4
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e64 s2, 0, v5
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v6, s1, 0, v0, s1
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e64 s1, s7, v18
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v1, s2
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v7, s0, s7, v4, s0
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v17
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v5, s2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v15, v2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, s1
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s0
+; GFX10-NEXT: v_sub_co_u32 v11, s0, v17, s6
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v13, s1, 0, v7, s0
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v18
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v2, s2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v9, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v13
+; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v11
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v13, vcc_lo, v10, 1
-; GFX10-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, 0, v7, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v12
-; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v13, 1
-; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, 0, v14, vcc_lo
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, vcc_lo, s7, v23, s1
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX10-NEXT: v_sub_co_u32 v8, s1, v6, s6
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, s1, 0, v18, s1
-; GFX10-NEXT: v_cndmask_b32_e32 v9, v13, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v13, v14, v17, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v8, v12, v18, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v9, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v13, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v15, v6, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v8, s1
-; GFX10-NEXT: global_store_dwordx4 v11, v[0:3], s[12:13]
-; GFX10-NEXT: global_store_dwordx4 v11, v[4:7], s[14:15]
+; GFX10-NEXT: v_add_co_u32 v14, vcc_lo, v8, 1
+; GFX10-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, 0, v10, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v14, 1
+; GFX10-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, 0, v15, vcc_lo
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v7, vcc_lo, s7, v7, s0
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v5
+; GFX10-NEXT: v_sub_co_u32 v5, s0, v11, s6
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v7, s0, 0, v7, s0
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v14, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v5, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v13, v7, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v5, v3, v6, s2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v8, v9, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v10, v14, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v17, v11, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v18, v7, s0
+; GFX10-NEXT: global_store_dwordx4 v12, v[0:3], s[12:13]
+; GFX10-NEXT: global_store_dwordx4 v12, v[4:7], s[14:15]
; GFX10-NEXT: s_endpgm
%div = udiv <2 x i64> %x, %y
store <2 x i64> %div, ptr addrspace(1) %out0
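
All of the udivrem bodies in this file open with the same float-based reciprocal seed, built from the literals that appear verbatim in the checks: 0x4f800000 (2^32), 0x5f7ffffc (just under 2^64), 0x2f800000 (2^-32) and 0xcf800000 (-2^32); the v_mad_u64_u32 rounds that follow refine that seed against the negated divisor (the s_sub_u32/s_subb_u32 from zero). A host-side C sketch of the seed for a 64-bit divisor d — 1.0f/x stands in for v_rcp_iflag_f32's hardware estimate, and the helper names are illustrative:

#include <math.h>
#include <stdint.h>
#include <string.h>

static float f32_from_bits(uint32_t b) { float f; memcpy(&f, &b, sizeof f); return f; }

/* Mirrors the v_cvt_f32_u32 / v_mul_f32 / v_add_f32 / v_rcp_iflag_f32 /
   v_trunc_f32 / v_cvt_u32_f32 prologue shared by the GFX8/9/10 checks. */
static void rcp64_seed(uint64_t d, uint32_t *seed_lo, uint32_t *seed_hi) {
    float f = (float)(uint32_t)(d >> 32) * f32_from_bits(0x4f800000u) /* hi * 2^32 */
            + (float)(uint32_t)d;                                     /* + lo      */
    float r = 1.0f / f;                          /* approximates v_rcp_iflag_f32   */
    r *= f32_from_bits(0x5f7ffffcu);             /* scale toward 2^64              */
    float rh = truncf(r * f32_from_bits(0x2f800000u)); /* top 32 bits of the seed  */
    float rl = rh * f32_from_bits(0xcf800000u) + r;    /* r - rh * 2^32            */
    *seed_hi = (uint32_t)rh;                     /* v_cvt_u32_f32                  */
    *seed_lo = (uint32_t)rl;                     /* v_cvt_u32_f32                  */
}
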
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
index b33b8a7..4a22a91 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
@@ -272,10 +272,6 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16
-; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32
-; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48
; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[0:1] offset:64
; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[0:1] offset:80
; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[0:1] offset:96
@@ -288,6 +284,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[0:1] offset:208
; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[0:1] offset:224
; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] offset:240
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16
+; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32
+; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB6_2
; GFX906-NEXT: ; %bb.1: ; %bb.1