aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll')
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll1400
1 files changed, 659 insertions, 741 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index e27164c..51df8c3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -6191,37 +6191,34 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3
-; GFX6-NEXT: s_ashr_i32 s8, s3, 31
-; GFX6-NEXT: s_add_i32 s3, s3, s8
-; GFX6-NEXT: s_xor_b32 s3, s3, s8
-; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3
-; GFX6-NEXT: s_sub_i32 s4, 0, s3
-; GFX6-NEXT: s_ashr_i32 s9, s2, 31
-; GFX6-NEXT: s_add_i32 s2, s2, s9
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT: s_xor_b32 s2, s2, s9
+; GFX6-NEXT: s_abs_i32 s8, s3
+; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8
+; GFX6-NEXT: s_sub_i32 s4, 0, s8
+; GFX6-NEXT: s_abs_i32 s9, s2
; GFX6-NEXT: s_mov_b32 s5, s1
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0
+; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: s_mul_i32 s0, s0, s3
-; GFX6-NEXT: s_sub_i32 s0, s2, s0
-; GFX6-NEXT: s_sub_i32 s1, s0, s3
+; GFX6-NEXT: s_mul_i32 s0, s0, s8
+; GFX6-NEXT: s_sub_i32 s0, s9, s0
+; GFX6-NEXT: s_sub_i32 s1, s0, s8
; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0
-; GFX6-NEXT: s_cmp_ge_u32 s0, s3
+; GFX6-NEXT: s_cmp_ge_u32 s0, s8
; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
; GFX6-NEXT: s_cselect_b32 s0, s1, s0
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0
-; GFX6-NEXT: s_cmp_ge_u32 s0, s3
+; GFX6-NEXT: s_cmp_ge_u32 s0, s8
; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT: s_xor_b32 s0, s2, s3
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-NEXT: s_xor_b32 s0, s9, s8
+; GFX6-NEXT: s_ashr_i32 s0, s0, 31
; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -6233,35 +6230,32 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3
-; GFX9-NEXT: s_ashr_i32 s4, s3, 31
-; GFX9-NEXT: s_add_i32 s3, s3, s4
-; GFX9-NEXT: s_xor_b32 s3, s3, s4
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT: s_sub_i32 s6, 0, s3
-; GFX9-NEXT: s_ashr_i32 s5, s2, 31
-; GFX9-NEXT: s_add_i32 s2, s2, s5
+; GFX9-NEXT: s_abs_i32 s4, s3
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX9-NEXT: s_sub_i32 s6, 0, s4
+; GFX9-NEXT: s_abs_i32 s5, s2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_xor_b32 s2, s2, s5
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_readfirstlane_b32 s7, v0
; GFX9-NEXT: s_mul_i32 s6, s6, s7
; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6
; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_mul_hi_u32 s6, s2, s7
-; GFX9-NEXT: s_mul_i32 s8, s6, s3
-; GFX9-NEXT: s_sub_i32 s2, s2, s8
+; GFX9-NEXT: s_mul_hi_u32 s6, s5, s7
+; GFX9-NEXT: s_mul_i32 s8, s6, s4
+; GFX9-NEXT: s_sub_i32 s5, s5, s8
; GFX9-NEXT: s_add_i32 s7, s6, 1
-; GFX9-NEXT: s_sub_i32 s8, s2, s3
-; GFX9-NEXT: s_cmp_ge_u32 s2, s3
+; GFX9-NEXT: s_sub_i32 s8, s5, s4
+; GFX9-NEXT: s_cmp_ge_u32 s5, s4
; GFX9-NEXT: s_cselect_b32 s6, s7, s6
-; GFX9-NEXT: s_cselect_b32 s2, s8, s2
+; GFX9-NEXT: s_cselect_b32 s5, s8, s5
; GFX9-NEXT: s_add_i32 s7, s6, 1
-; GFX9-NEXT: s_cmp_ge_u32 s2, s3
-; GFX9-NEXT: s_cselect_b32 s2, s7, s6
-; GFX9-NEXT: s_xor_b32 s3, s5, s4
+; GFX9-NEXT: s_cmp_ge_u32 s5, s4
+; GFX9-NEXT: s_cselect_b32 s4, s7, s6
; GFX9-NEXT: s_xor_b32 s2, s2, s3
-; GFX9-NEXT: s_sub_i32 s2, s2, s3
+; GFX9-NEXT: s_ashr_i32 s2, s2, 31
+; GFX9-NEXT: s_xor_b32 s3, s4, s2
+; GFX9-NEXT: s_sub_i32 s2, s3, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
@@ -6706,38 +6700,37 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
; GFX6-LABEL: srem_i32_pow2_shl_denom:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3
-; GFX6-NEXT: s_ashr_i32 s4, s3, 31
-; GFX6-NEXT: s_add_i32 s3, s3, s4
-; GFX6-NEXT: s_xor_b32 s4, s3, s4
-; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4
-; GFX6-NEXT: s_sub_i32 s3, 0, s4
-; GFX6-NEXT: s_ashr_i32 s5, s2, 31
-; GFX6-NEXT: s_add_i32 s2, s2, s5
+; GFX6-NEXT: s_abs_i32 s3, s3
+; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3
+; GFX6-NEXT: s_sub_i32 s4, 0, s3
+; GFX6-NEXT: s_abs_i32 s8, s2
+; GFX6-NEXT: s_mov_b32 s5, s1
; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT: s_xor_b32 s6, s2, s5
-; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0
-; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0
+; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
-; GFX6-NEXT: v_readfirstlane_b32 s7, v0
-; GFX6-NEXT: s_mul_i32 s7, s7, s4
-; GFX6-NEXT: s_sub_i32 s6, s6, s7
-; GFX6-NEXT: s_sub_i32 s7, s6, s4
-; GFX6-NEXT: s_cmp_ge_u32 s6, s4
-; GFX6-NEXT: s_cselect_b32 s6, s7, s6
-; GFX6-NEXT: s_sub_i32 s7, s6, s4
-; GFX6-NEXT: s_cmp_ge_u32 s6, s4
-; GFX6-NEXT: s_cselect_b32 s4, s7, s6
-; GFX6-NEXT: s_xor_b32 s4, s4, s5
-; GFX6-NEXT: s_sub_i32 s4, s4, s5
-; GFX6-NEXT: v_mov_b32_e32 v0, s4
-; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
+; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: s_mul_i32 s0, s0, s3
+; GFX6-NEXT: s_sub_i32 s0, s8, s0
+; GFX6-NEXT: s_sub_i32 s1, s0, s3
+; GFX6-NEXT: s_cmp_ge_u32 s0, s3
+; GFX6-NEXT: s_cselect_b32 s0, s1, s0
+; GFX6-NEXT: s_sub_i32 s1, s0, s3
+; GFX6-NEXT: s_cmp_ge_u32 s0, s3
+; GFX6-NEXT: s_cselect_b32 s0, s1, s0
+; GFX6-NEXT: s_ashr_i32 s1, s2, 31
+; GFX6-NEXT: s_xor_b32 s0, s0, s1
+; GFX6-NEXT: s_sub_i32 s0, s0, s1
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
+; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: srem_i32_pow2_shl_denom:
@@ -6746,32 +6739,29 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3
-; GFX9-NEXT: s_ashr_i32 s4, s3, 31
-; GFX9-NEXT: s_add_i32 s3, s3, s4
-; GFX9-NEXT: s_xor_b32 s3, s3, s4
+; GFX9-NEXT: s_abs_i32 s3, s3
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT: s_sub_i32 s5, 0, s3
-; GFX9-NEXT: s_ashr_i32 s4, s2, 31
-; GFX9-NEXT: s_add_i32 s2, s2, s4
+; GFX9-NEXT: s_abs_i32 s4, s2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_xor_b32 s2, s2, s4
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_readfirstlane_b32 s6, v0
; GFX9-NEXT: s_mul_i32 s5, s5, s6
; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5
; GFX9-NEXT: s_add_i32 s6, s6, s5
-; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6
+; GFX9-NEXT: s_mul_hi_u32 s5, s4, s6
; GFX9-NEXT: s_mul_i32 s5, s5, s3
-; GFX9-NEXT: s_sub_i32 s2, s2, s5
-; GFX9-NEXT: s_sub_i32 s5, s2, s3
-; GFX9-NEXT: s_cmp_ge_u32 s2, s3
-; GFX9-NEXT: s_cselect_b32 s2, s5, s2
-; GFX9-NEXT: s_sub_i32 s5, s2, s3
-; GFX9-NEXT: s_cmp_ge_u32 s2, s3
-; GFX9-NEXT: s_cselect_b32 s2, s5, s2
-; GFX9-NEXT: s_xor_b32 s2, s2, s4
-; GFX9-NEXT: s_sub_i32 s2, s2, s4
+; GFX9-NEXT: s_sub_i32 s4, s4, s5
+; GFX9-NEXT: s_sub_i32 s5, s4, s3
+; GFX9-NEXT: s_cmp_ge_u32 s4, s3
+; GFX9-NEXT: s_cselect_b32 s4, s5, s4
+; GFX9-NEXT: s_sub_i32 s5, s4, s3
+; GFX9-NEXT: s_cmp_ge_u32 s4, s3
+; GFX9-NEXT: s_cselect_b32 s3, s5, s4
+; GFX9-NEXT: s_ashr_i32 s2, s2, 31
+; GFX9-NEXT: s_xor_b32 s3, s3, s2
+; GFX9-NEXT: s_sub_i32 s2, s3, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
@@ -7831,10 +7821,9 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_addc_u32 s15, 0, s16
; GFX6-NEXT: s_add_u32 s16, s0, s1
; GFX6-NEXT: v_mov_b32_e32 v0, s16
-; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s14, s14, s15
; GFX6-NEXT: s_mul_i32 s0, s12, s14
; GFX6-NEXT: v_readfirstlane_b32 s1, v0
@@ -7865,7 +7854,6 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_add_u32 s15, s16, s0
; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s14, s14, s12
; GFX6-NEXT: s_ashr_i32 s12, s7, 31
; GFX6-NEXT: s_add_u32 s0, s6, s12
@@ -7891,52 +7879,50 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: v_readfirstlane_b32 s4, v0
; GFX6-NEXT: s_addc_u32 s4, s4, 0
; GFX6-NEXT: s_mul_i32 s14, s7, s14
-; GFX6-NEXT: s_add_u32 s14, s1, s14
-; GFX6-NEXT: v_mov_b32_e32 v0, s14
+; GFX6-NEXT: s_add_u32 s16, s1, s14
+; GFX6-NEXT: v_mov_b32_e32 v0, s16
; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0
-; GFX6-NEXT: s_addc_u32 s15, 0, s4
+; GFX6-NEXT: s_addc_u32 s17, 0, s4
; GFX6-NEXT: s_mov_b32 s1, s5
-; GFX6-NEXT: s_mul_i32 s4, s10, s15
+; GFX6-NEXT: s_mul_i32 s4, s10, s17
; GFX6-NEXT: v_readfirstlane_b32 s5, v0
; GFX6-NEXT: s_add_i32 s4, s5, s4
-; GFX6-NEXT: s_mul_i32 s5, s11, s14
-; GFX6-NEXT: s_add_i32 s16, s4, s5
-; GFX6-NEXT: s_sub_i32 s17, s7, s16
-; GFX6-NEXT: s_mul_i32 s4, s10, s14
+; GFX6-NEXT: s_mul_i32 s5, s11, s16
+; GFX6-NEXT: s_add_i32 s18, s4, s5
+; GFX6-NEXT: s_sub_i32 s14, s7, s18
+; GFX6-NEXT: s_mul_i32 s4, s10, s16
; GFX6-NEXT: s_sub_u32 s6, s6, s4
; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX6-NEXT: s_or_b32 s18, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s18, 0
-; GFX6-NEXT: s_subb_u32 s17, s17, s11
-; GFX6-NEXT: s_sub_u32 s19, s6, s10
-; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX6-NEXT: s_or_b32 s15, s4, s5
+; GFX6-NEXT: s_subb_u32 s19, s14, s11
+; GFX6-NEXT: s_sub_u32 s20, s6, s10
+; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT: s_or_b32 s14, s14, s15
+; GFX6-NEXT: s_subb_u32 s14, s19, 0
+; GFX6-NEXT: s_cmp_ge_u32 s14, s11
+; GFX6-NEXT: s_cselect_b32 s15, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s20, s10
+; GFX6-NEXT: s_cselect_b32 s19, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s14, s11
+; GFX6-NEXT: s_cselect_b32 s14, s19, s15
+; GFX6-NEXT: s_add_u32 s15, s16, 1
+; GFX6-NEXT: s_addc_u32 s19, s17, 0
+; GFX6-NEXT: s_add_u32 s20, s16, 2
+; GFX6-NEXT: s_addc_u32 s21, s17, 0
+; GFX6-NEXT: s_cmp_lg_u32 s14, 0
+; GFX6-NEXT: s_cselect_b32 s14, s20, s15
+; GFX6-NEXT: s_cselect_b32 s15, s21, s19
; GFX6-NEXT: s_or_b32 s4, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_subb_u32 s4, s17, 0
+; GFX6-NEXT: s_subb_u32 s4, s7, s18
; GFX6-NEXT: s_cmp_ge_u32 s4, s11
; GFX6-NEXT: s_cselect_b32 s5, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s19, s10
-; GFX6-NEXT: s_cselect_b32 s17, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s4, s11
-; GFX6-NEXT: s_cselect_b32 s4, s17, s5
-; GFX6-NEXT: s_add_u32 s5, s14, 1
-; GFX6-NEXT: s_addc_u32 s17, s15, 0
-; GFX6-NEXT: s_add_u32 s19, s14, 2
-; GFX6-NEXT: s_addc_u32 s20, s15, 0
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_cselect_b32 s4, s19, s5
-; GFX6-NEXT: s_cselect_b32 s5, s20, s17
-; GFX6-NEXT: s_cmp_lg_u32 s18, 0
-; GFX6-NEXT: s_subb_u32 s7, s7, s16
-; GFX6-NEXT: s_cmp_ge_u32 s7, s11
-; GFX6-NEXT: s_cselect_b32 s16, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s6, s10
; GFX6-NEXT: s_cselect_b32 s6, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s7, s11
-; GFX6-NEXT: s_cselect_b32 s6, s6, s16
-; GFX6-NEXT: s_cmp_lg_u32 s6, 0
-; GFX6-NEXT: s_cselect_b32 s5, s5, s15
-; GFX6-NEXT: s_cselect_b32 s4, s4, s14
+; GFX6-NEXT: s_cmp_eq_u32 s4, s11
+; GFX6-NEXT: s_cselect_b32 s4, s6, s5
+; GFX6-NEXT: s_cmp_lg_u32 s4, 0
+; GFX6-NEXT: s_cselect_b32 s5, s15, s17
+; GFX6-NEXT: s_cselect_b32 s4, s14, s16
; GFX6-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9]
; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT: s_sub_u32 s4, s4, s6
@@ -7959,8 +7945,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s10, 0, s8
-; GFX9-NEXT: s_subb_u32 s11, 0, s9
+; GFX9-NEXT: s_sub_u32 s4, 0, s8
+; GFX9-NEXT: s_subb_u32 s5, 0, s9
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -7970,56 +7956,52 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s12, v2
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_mul_i32 s5, s10, s12
-; GFX9-NEXT: s_mul_hi_u32 s14, s10, s4
-; GFX9-NEXT: s_mul_i32 s13, s11, s4
-; GFX9-NEXT: s_add_i32 s5, s14, s5
-; GFX9-NEXT: s_mul_i32 s15, s10, s4
-; GFX9-NEXT: s_add_i32 s5, s5, s13
-; GFX9-NEXT: s_mul_hi_u32 s14, s4, s15
-; GFX9-NEXT: s_mul_i32 s16, s4, s5
-; GFX9-NEXT: s_mul_hi_u32 s13, s4, s5
+; GFX9-NEXT: v_readfirstlane_b32 s10, v2
+; GFX9-NEXT: v_readfirstlane_b32 s11, v1
+; GFX9-NEXT: s_mul_i32 s12, s4, s10
+; GFX9-NEXT: s_mul_hi_u32 s14, s4, s11
+; GFX9-NEXT: s_mul_i32 s13, s5, s11
+; GFX9-NEXT: s_add_i32 s12, s14, s12
+; GFX9-NEXT: s_mul_i32 s15, s4, s11
+; GFX9-NEXT: s_add_i32 s12, s12, s13
+; GFX9-NEXT: s_mul_hi_u32 s14, s11, s15
+; GFX9-NEXT: s_mul_i32 s16, s11, s12
+; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12
; GFX9-NEXT: s_add_u32 s14, s14, s16
; GFX9-NEXT: s_addc_u32 s13, 0, s13
-; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15
-; GFX9-NEXT: s_mul_i32 s15, s12, s15
+; GFX9-NEXT: s_mul_hi_u32 s17, s10, s15
+; GFX9-NEXT: s_mul_i32 s15, s10, s15
; GFX9-NEXT: s_add_u32 s14, s14, s15
-; GFX9-NEXT: s_mul_hi_u32 s16, s12, s5
+; GFX9-NEXT: s_mul_hi_u32 s16, s10, s12
; GFX9-NEXT: s_addc_u32 s13, s13, s17
; GFX9-NEXT: s_addc_u32 s14, s16, 0
-; GFX9-NEXT: s_mul_i32 s5, s12, s5
-; GFX9-NEXT: s_add_u32 s5, s13, s5
+; GFX9-NEXT: s_mul_i32 s12, s10, s12
+; GFX9-NEXT: s_add_u32 s12, s13, s12
; GFX9-NEXT: s_addc_u32 s13, 0, s14
-; GFX9-NEXT: s_add_u32 s14, s4, s5
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s12, s12, s13
-; GFX9-NEXT: s_mul_i32 s4, s10, s12
-; GFX9-NEXT: s_mul_hi_u32 s5, s10, s14
-; GFX9-NEXT: s_add_i32 s4, s5, s4
-; GFX9-NEXT: s_mul_i32 s11, s11, s14
-; GFX9-NEXT: s_add_i32 s4, s4, s11
-; GFX9-NEXT: s_mul_i32 s10, s10, s14
-; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10
-; GFX9-NEXT: s_mul_i32 s13, s12, s10
-; GFX9-NEXT: s_mul_i32 s16, s14, s4
-; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10
-; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4
-; GFX9-NEXT: s_add_u32 s10, s10, s16
+; GFX9-NEXT: s_add_u32 s11, s11, s12
+; GFX9-NEXT: s_addc_u32 s10, s10, s13
+; GFX9-NEXT: s_mul_i32 s12, s4, s10
+; GFX9-NEXT: s_mul_hi_u32 s13, s4, s11
+; GFX9-NEXT: s_add_i32 s12, s13, s12
+; GFX9-NEXT: s_mul_i32 s5, s5, s11
+; GFX9-NEXT: s_add_i32 s12, s12, s5
+; GFX9-NEXT: s_mul_i32 s4, s4, s11
+; GFX9-NEXT: s_mul_hi_u32 s13, s10, s4
+; GFX9-NEXT: s_mul_i32 s14, s10, s4
+; GFX9-NEXT: s_mul_i32 s16, s11, s12
+; GFX9-NEXT: s_mul_hi_u32 s4, s11, s4
+; GFX9-NEXT: s_mul_hi_u32 s15, s11, s12
+; GFX9-NEXT: s_add_u32 s4, s4, s16
; GFX9-NEXT: s_addc_u32 s15, 0, s15
-; GFX9-NEXT: s_add_u32 s10, s10, s13
-; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4
-; GFX9-NEXT: s_addc_u32 s10, s15, s11
+; GFX9-NEXT: s_add_u32 s4, s4, s14
+; GFX9-NEXT: s_mul_hi_u32 s5, s10, s12
+; GFX9-NEXT: s_addc_u32 s4, s15, s13
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s4, s12, s4
-; GFX9-NEXT: s_add_u32 s4, s10, s4
-; GFX9-NEXT: s_addc_u32 s10, 0, s5
-; GFX9-NEXT: s_add_u32 s11, s14, s4
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s10, s12, s10
+; GFX9-NEXT: s_mul_i32 s12, s10, s12
+; GFX9-NEXT: s_add_u32 s4, s4, s12
+; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_add_u32 s11, s11, s4
+; GFX9-NEXT: s_addc_u32 s10, s10, s5
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: s_add_u32 s2, s2, s4
@@ -8038,38 +8020,35 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_addc_u32 s11, s12, s15
; GFX9-NEXT: s_addc_u32 s12, s14, 0
; GFX9-NEXT: s_mul_i32 s10, s3, s10
-; GFX9-NEXT: s_add_u32 s14, s11, s10
-; GFX9-NEXT: s_addc_u32 s15, 0, s12
-; GFX9-NEXT: s_mul_i32 s10, s8, s15
-; GFX9-NEXT: s_mul_hi_u32 s11, s8, s14
+; GFX9-NEXT: s_add_u32 s13, s11, s10
+; GFX9-NEXT: s_addc_u32 s12, 0, s12
+; GFX9-NEXT: s_mul_i32 s10, s8, s12
+; GFX9-NEXT: s_mul_hi_u32 s11, s8, s13
; GFX9-NEXT: s_add_i32 s10, s11, s10
-; GFX9-NEXT: s_mul_i32 s11, s9, s14
-; GFX9-NEXT: s_add_i32 s16, s10, s11
-; GFX9-NEXT: s_sub_i32 s12, s3, s16
-; GFX9-NEXT: s_mul_i32 s10, s8, s14
+; GFX9-NEXT: s_mul_i32 s11, s9, s13
+; GFX9-NEXT: s_add_i32 s14, s10, s11
+; GFX9-NEXT: s_sub_i32 s15, s3, s14
+; GFX9-NEXT: s_mul_i32 s10, s8, s13
; GFX9-NEXT: s_sub_u32 s2, s2, s10
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s17, s12, s9
-; GFX9-NEXT: s_sub_u32 s18, s2, s8
-; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GFX9-NEXT: s_subb_u32 s12, s17, 0
-; GFX9-NEXT: s_cmp_ge_u32 s12, s9
-; GFX9-NEXT: s_cselect_b32 s13, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s18, s8
+; GFX9-NEXT: s_subb_u32 s15, s15, s9
+; GFX9-NEXT: s_sub_u32 s16, s2, s8
+; GFX9-NEXT: s_subb_u32 s15, s15, 0
+; GFX9-NEXT: s_cmp_ge_u32 s15, s9
; GFX9-NEXT: s_cselect_b32 s17, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s12, s9
-; GFX9-NEXT: s_cselect_b32 s12, s17, s13
-; GFX9-NEXT: s_add_u32 s13, s14, 1
-; GFX9-NEXT: s_addc_u32 s17, s15, 0
-; GFX9-NEXT: s_add_u32 s18, s14, 2
-; GFX9-NEXT: s_addc_u32 s19, s15, 0
-; GFX9-NEXT: s_cmp_lg_u32 s12, 0
-; GFX9-NEXT: s_cselect_b32 s12, s18, s13
-; GFX9-NEXT: s_cselect_b32 s13, s19, s17
+; GFX9-NEXT: s_cmp_ge_u32 s16, s8
+; GFX9-NEXT: s_cselect_b32 s16, -1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s15, s9
+; GFX9-NEXT: s_cselect_b32 s15, s16, s17
+; GFX9-NEXT: s_add_u32 s16, s13, 1
+; GFX9-NEXT: s_addc_u32 s17, s12, 0
+; GFX9-NEXT: s_add_u32 s18, s13, 2
+; GFX9-NEXT: s_addc_u32 s19, s12, 0
+; GFX9-NEXT: s_cmp_lg_u32 s15, 0
+; GFX9-NEXT: s_cselect_b32 s15, s18, s16
+; GFX9-NEXT: s_cselect_b32 s16, s19, s17
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s3, s3, s16
+; GFX9-NEXT: s_subb_u32 s3, s3, s14
; GFX9-NEXT: s_cmp_ge_u32 s3, s9
; GFX9-NEXT: s_cselect_b32 s10, -1, 0
; GFX9-NEXT: s_cmp_ge_u32 s2, s8
@@ -8077,8 +8056,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_cmp_eq_u32 s3, s9
; GFX9-NEXT: s_cselect_b32 s2, s2, s10
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
-; GFX9-NEXT: s_cselect_b32 s3, s13, s15
-; GFX9-NEXT: s_cselect_b32 s2, s12, s14
+; GFX9-NEXT: s_cselect_b32 s3, s16, s12
+; GFX9-NEXT: s_cselect_b32 s2, s15, s13
; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: s_sub_u32 s2, s2, s4
@@ -8338,10 +8317,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_addc_u32 s17, 0, s18
; GFX6-NEXT: s_add_u32 s18, s12, s13
; GFX6-NEXT: v_mov_b32_e32 v0, s18
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
; GFX6-NEXT: s_addc_u32 s16, s16, s17
; GFX6-NEXT: s_mul_i32 s12, s14, s16
; GFX6-NEXT: v_readfirstlane_b32 s13, v0
@@ -8372,7 +8350,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_add_u32 s15, s18, s12
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
; GFX6-NEXT: s_addc_u32 s14, s16, s14
; GFX6-NEXT: s_ashr_i32 s12, s9, 31
; GFX6-NEXT: s_add_u32 s8, s8, s12
@@ -8397,55 +8374,53 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_readfirstlane_b32 s16, v0
; GFX6-NEXT: s_addc_u32 s16, s16, 0
; GFX6-NEXT: s_mul_i32 s14, s9, s14
-; GFX6-NEXT: s_add_u32 s17, s15, s14
-; GFX6-NEXT: v_mov_b32_e32 v0, s17
+; GFX6-NEXT: s_add_u32 s18, s15, s14
+; GFX6-NEXT: v_mov_b32_e32 v0, s18
; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
-; GFX6-NEXT: s_addc_u32 s16, 0, s16
-; GFX6-NEXT: s_mul_i32 s14, s6, s16
+; GFX6-NEXT: s_addc_u32 s19, 0, s16
+; GFX6-NEXT: s_mul_i32 s14, s6, s19
; GFX6-NEXT: v_readfirstlane_b32 s15, v0
; GFX6-NEXT: s_add_i32 s14, s15, s14
-; GFX6-NEXT: s_mul_i32 s15, s7, s17
-; GFX6-NEXT: s_add_i32 s18, s14, s15
-; GFX6-NEXT: s_sub_i32 s19, s9, s18
-; GFX6-NEXT: s_mul_i32 s14, s6, s17
+; GFX6-NEXT: s_mul_i32 s15, s7, s18
+; GFX6-NEXT: s_add_i32 s20, s14, s15
+; GFX6-NEXT: s_sub_i32 s16, s9, s20
+; GFX6-NEXT: s_mul_i32 s14, s6, s18
; GFX6-NEXT: s_sub_u32 s8, s8, s14
; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT: s_or_b32 s20, s14, s15
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_subb_u32 s19, s19, s7
-; GFX6-NEXT: s_sub_u32 s21, s8, s6
-; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT: s_or_b32 s17, s14, s15
+; GFX6-NEXT: s_subb_u32 s21, s16, s7
+; GFX6-NEXT: s_sub_u32 s22, s8, s6
+; GFX6-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GFX6-NEXT: s_or_b32 s16, s16, s17
+; GFX6-NEXT: s_subb_u32 s16, s21, 0
+; GFX6-NEXT: s_cmp_ge_u32 s16, s7
+; GFX6-NEXT: s_cselect_b32 s17, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s22, s6
+; GFX6-NEXT: s_cselect_b32 s21, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s16, s7
+; GFX6-NEXT: s_cselect_b32 s16, s21, s17
+; GFX6-NEXT: s_add_u32 s17, s18, 1
+; GFX6-NEXT: s_addc_u32 s21, s19, 0
+; GFX6-NEXT: s_add_u32 s22, s18, 2
+; GFX6-NEXT: s_addc_u32 s23, s19, 0
+; GFX6-NEXT: s_cmp_lg_u32 s16, 0
+; GFX6-NEXT: s_cselect_b32 s16, s22, s17
+; GFX6-NEXT: s_cselect_b32 s17, s23, s21
; GFX6-NEXT: s_or_b32 s14, s14, s15
-; GFX6-NEXT: s_cmp_lg_u32 s14, 0
-; GFX6-NEXT: s_subb_u32 s14, s19, 0
-; GFX6-NEXT: s_cmp_ge_u32 s14, s7
-; GFX6-NEXT: s_cselect_b32 s15, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s21, s6
-; GFX6-NEXT: s_cselect_b32 s19, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s14, s7
-; GFX6-NEXT: s_cselect_b32 s14, s19, s15
-; GFX6-NEXT: s_add_u32 s15, s17, 1
-; GFX6-NEXT: s_addc_u32 s19, s16, 0
-; GFX6-NEXT: s_add_u32 s21, s17, 2
-; GFX6-NEXT: s_addc_u32 s22, s16, 0
-; GFX6-NEXT: s_cmp_lg_u32 s14, 0
-; GFX6-NEXT: s_cselect_b32 s14, s21, s15
-; GFX6-NEXT: s_cselect_b32 s15, s22, s19
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_subb_u32 s9, s9, s18
+; GFX6-NEXT: s_subb_u32 s9, s9, s20
; GFX6-NEXT: s_cmp_ge_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s18, -1, 0
+; GFX6-NEXT: s_cselect_b32 s14, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s8, s6
; GFX6-NEXT: s_cselect_b32 s6, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s6, s6, s18
+; GFX6-NEXT: s_cselect_b32 s6, s6, s14
; GFX6-NEXT: s_cmp_lg_u32 s6, 0
-; GFX6-NEXT: s_cselect_b32 s7, s15, s16
-; GFX6-NEXT: s_cselect_b32 s6, s14, s17
+; GFX6-NEXT: s_cselect_b32 s7, s17, s19
+; GFX6-NEXT: s_cselect_b32 s6, s16, s18
; GFX6-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3]
; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
-; GFX6-NEXT: s_sub_u32 s14, s6, s2
-; GFX6-NEXT: s_subb_u32 s15, s7, s3
+; GFX6-NEXT: s_sub_u32 s16, s6, s2
+; GFX6-NEXT: s_subb_u32 s17, s7, s3
; GFX6-NEXT: s_ashr_i32 s6, s1, 31
; GFX6-NEXT: s_add_u32 s0, s0, s6
; GFX6-NEXT: s_mov_b32 s7, s6
@@ -8464,40 +8439,39 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0
-; GFX6-NEXT: v_readfirstlane_b32 s16, v1
+; GFX6-NEXT: v_readfirstlane_b32 s14, v1
; GFX6-NEXT: v_readfirstlane_b32 s2, v0
-; GFX6-NEXT: s_mul_i32 s1, s12, s16
+; GFX6-NEXT: s_mul_i32 s1, s12, s14
; GFX6-NEXT: v_readfirstlane_b32 s3, v2
; GFX6-NEXT: s_mul_i32 s0, s13, s2
; GFX6-NEXT: s_add_i32 s1, s3, s1
; GFX6-NEXT: s_add_i32 s3, s1, s0
-; GFX6-NEXT: s_mul_i32 s17, s12, s2
+; GFX6-NEXT: s_mul_i32 s15, s12, s2
; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3
-; GFX6-NEXT: v_mul_hi_u32 v0, v0, s17
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: s_mul_i32 s4, s2, s3
; GFX6-NEXT: v_readfirstlane_b32 s5, v2
; GFX6-NEXT: v_readfirstlane_b32 s18, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, v1, s17
+; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15
; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3
; GFX6-NEXT: s_add_u32 s4, s18, s4
; GFX6-NEXT: s_addc_u32 s5, 0, s5
-; GFX6-NEXT: s_mul_i32 s17, s16, s17
+; GFX6-NEXT: s_mul_i32 s15, s14, s15
; GFX6-NEXT: v_readfirstlane_b32 s18, v0
-; GFX6-NEXT: s_add_u32 s4, s4, s17
+; GFX6-NEXT: s_add_u32 s4, s4, s15
; GFX6-NEXT: s_addc_u32 s4, s5, s18
; GFX6-NEXT: v_readfirstlane_b32 s5, v1
; GFX6-NEXT: s_addc_u32 s5, s5, 0
-; GFX6-NEXT: s_mul_i32 s3, s16, s3
+; GFX6-NEXT: s_mul_i32 s3, s14, s3
; GFX6-NEXT: s_add_u32 s3, s4, s3
; GFX6-NEXT: s_addc_u32 s4, 0, s5
; GFX6-NEXT: s_add_u32 s5, s2, s3
; GFX6-NEXT: v_mov_b32_e32 v0, s5
-; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6-NEXT: s_addc_u32 s4, s16, s4
+; GFX6-NEXT: s_addc_u32 s4, s14, s4
; GFX6-NEXT: s_mul_i32 s2, s12, s4
; GFX6-NEXT: v_readfirstlane_b32 s3, v0
; GFX6-NEXT: s_add_i32 s2, s3, s2
@@ -8511,14 +8485,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX6-NEXT: s_mul_i32 s13, s5, s2
-; GFX6-NEXT: v_readfirstlane_b32 s17, v2
-; GFX6-NEXT: s_add_u32 s13, s17, s13
-; GFX6-NEXT: v_readfirstlane_b32 s16, v0
+; GFX6-NEXT: v_readfirstlane_b32 s15, v2
+; GFX6-NEXT: s_add_u32 s13, s15, s13
+; GFX6-NEXT: v_readfirstlane_b32 s14, v0
; GFX6-NEXT: s_mul_i32 s3, s4, s3
-; GFX6-NEXT: s_addc_u32 s16, 0, s16
+; GFX6-NEXT: s_addc_u32 s14, 0, s14
; GFX6-NEXT: v_readfirstlane_b32 s12, v3
; GFX6-NEXT: s_add_u32 s3, s13, s3
-; GFX6-NEXT: s_addc_u32 s3, s16, s12
+; GFX6-NEXT: s_addc_u32 s3, s14, s12
; GFX6-NEXT: v_readfirstlane_b32 s12, v1
; GFX6-NEXT: s_addc_u32 s12, s12, 0
; GFX6-NEXT: s_mul_i32 s2, s4, s2
@@ -8527,7 +8501,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_add_u32 s13, s5, s2
; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
; GFX6-NEXT: s_addc_u32 s12, s4, s12
; GFX6-NEXT: s_ashr_i32 s4, s11, 31
; GFX6-NEXT: s_add_u32 s2, s10, s4
@@ -8539,72 +8512,70 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mov_b32_e32 v2, s13
; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2
; GFX6-NEXT: s_mul_i32 s2, s10, s12
-; GFX6-NEXT: v_readfirstlane_b32 s16, v1
+; GFX6-NEXT: v_readfirstlane_b32 s14, v1
; GFX6-NEXT: v_mul_hi_u32 v1, s11, v2
-; GFX6-NEXT: v_readfirstlane_b32 s17, v3
+; GFX6-NEXT: v_readfirstlane_b32 s15, v3
; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0
-; GFX6-NEXT: s_add_u32 s2, s17, s2
-; GFX6-NEXT: s_addc_u32 s16, 0, s16
+; GFX6-NEXT: s_add_u32 s2, s15, s2
+; GFX6-NEXT: s_addc_u32 s14, 0, s14
; GFX6-NEXT: s_mul_i32 s13, s11, s13
-; GFX6-NEXT: v_readfirstlane_b32 s17, v1
+; GFX6-NEXT: v_readfirstlane_b32 s15, v1
; GFX6-NEXT: s_add_u32 s2, s2, s13
-; GFX6-NEXT: s_addc_u32 s2, s16, s17
+; GFX6-NEXT: s_addc_u32 s2, s14, s15
; GFX6-NEXT: v_readfirstlane_b32 s13, v0
; GFX6-NEXT: s_addc_u32 s13, s13, 0
; GFX6-NEXT: s_mul_i32 s12, s11, s12
-; GFX6-NEXT: s_add_u32 s16, s2, s12
-; GFX6-NEXT: v_mov_b32_e32 v0, s16
+; GFX6-NEXT: s_add_u32 s18, s2, s12
+; GFX6-NEXT: v_mov_b32_e32 v0, s18
; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
-; GFX6-NEXT: s_addc_u32 s17, 0, s13
-; GFX6-NEXT: s_mul_i32 s12, s8, s17
+; GFX6-NEXT: s_addc_u32 s19, 0, s13
+; GFX6-NEXT: s_mul_i32 s12, s8, s19
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: v_readfirstlane_b32 s13, v0
; GFX6-NEXT: s_add_i32 s12, s13, s12
-; GFX6-NEXT: s_mul_i32 s13, s9, s16
-; GFX6-NEXT: s_add_i32 s18, s12, s13
-; GFX6-NEXT: s_sub_i32 s19, s11, s18
-; GFX6-NEXT: s_mul_i32 s12, s8, s16
+; GFX6-NEXT: s_mul_i32 s13, s9, s18
+; GFX6-NEXT: s_add_i32 s20, s12, s13
+; GFX6-NEXT: s_sub_i32 s14, s11, s20
+; GFX6-NEXT: s_mul_i32 s12, s8, s18
; GFX6-NEXT: s_sub_u32 s10, s10, s12
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s20, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_subb_u32 s19, s19, s9
-; GFX6-NEXT: s_sub_u32 s21, s10, s8
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s15, s12, s13
+; GFX6-NEXT: s_subb_u32 s21, s14, s9
+; GFX6-NEXT: s_sub_u32 s22, s10, s8
+; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT: s_or_b32 s14, s14, s15
+; GFX6-NEXT: s_subb_u32 s14, s21, 0
+; GFX6-NEXT: s_cmp_ge_u32 s14, s9
+; GFX6-NEXT: s_cselect_b32 s15, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s22, s8
+; GFX6-NEXT: s_cselect_b32 s21, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s14, s9
+; GFX6-NEXT: s_cselect_b32 s14, s21, s15
+; GFX6-NEXT: s_add_u32 s15, s18, 1
+; GFX6-NEXT: s_addc_u32 s21, s19, 0
+; GFX6-NEXT: s_add_u32 s22, s18, 2
+; GFX6-NEXT: s_addc_u32 s23, s19, 0
+; GFX6-NEXT: s_cmp_lg_u32 s14, 0
+; GFX6-NEXT: s_cselect_b32 s14, s22, s15
+; GFX6-NEXT: s_cselect_b32 s15, s23, s21
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s12, s19, 0
-; GFX6-NEXT: s_cmp_ge_u32 s12, s9
-; GFX6-NEXT: s_cselect_b32 s13, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s21, s8
-; GFX6-NEXT: s_cselect_b32 s19, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s12, s9
-; GFX6-NEXT: s_cselect_b32 s12, s19, s13
-; GFX6-NEXT: s_add_u32 s13, s16, 1
-; GFX6-NEXT: s_addc_u32 s19, s17, 0
-; GFX6-NEXT: s_add_u32 s21, s16, 2
-; GFX6-NEXT: s_addc_u32 s22, s17, 0
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_cselect_b32 s12, s21, s13
-; GFX6-NEXT: s_cselect_b32 s13, s22, s19
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_subb_u32 s11, s11, s18
+; GFX6-NEXT: s_subb_u32 s11, s11, s20
; GFX6-NEXT: s_cmp_ge_u32 s11, s9
-; GFX6-NEXT: s_cselect_b32 s18, -1, 0
+; GFX6-NEXT: s_cselect_b32 s12, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s10, s8
; GFX6-NEXT: s_cselect_b32 s8, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s11, s9
-; GFX6-NEXT: s_cselect_b32 s8, s8, s18
+; GFX6-NEXT: s_cselect_b32 s8, s8, s12
; GFX6-NEXT: s_cmp_lg_u32 s8, 0
-; GFX6-NEXT: s_cselect_b32 s9, s13, s17
-; GFX6-NEXT: s_cselect_b32 s8, s12, s16
+; GFX6-NEXT: s_cselect_b32 s9, s15, s19
+; GFX6-NEXT: s_cselect_b32 s8, s14, s18
; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5]
; GFX6-NEXT: s_sub_u32 s4, s6, s4
; GFX6-NEXT: s_subb_u32 s5, s7, s5
; GFX6-NEXT: s_mov_b32 s2, -1
-; GFX6-NEXT: v_mov_b32_e32 v0, s14
-; GFX6-NEXT: v_mov_b32_e32 v1, s15
+; GFX6-NEXT: v_mov_b32_e32 v0, s16
+; GFX6-NEXT: v_mov_b32_e32 v1, s17
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: v_mov_b32_e32 v3, s5
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -8624,8 +8595,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT: s_sub_u32 s14, 0, s6
-; GFX9-NEXT: s_subb_u32 s15, 0, s7
+; GFX9-NEXT: s_sub_u32 s12, 0, s6
+; GFX9-NEXT: s_subb_u32 s13, 0, s7
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -8634,56 +8605,52 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s16, v1
-; GFX9-NEXT: v_readfirstlane_b32 s12, v0
-; GFX9-NEXT: s_mul_i32 s13, s14, s16
-; GFX9-NEXT: s_mul_hi_u32 s18, s14, s12
-; GFX9-NEXT: s_mul_i32 s17, s15, s12
-; GFX9-NEXT: s_add_i32 s13, s18, s13
-; GFX9-NEXT: s_mul_i32 s19, s14, s12
-; GFX9-NEXT: s_add_i32 s13, s13, s17
-; GFX9-NEXT: s_mul_hi_u32 s18, s12, s19
-; GFX9-NEXT: s_mul_i32 s20, s12, s13
-; GFX9-NEXT: s_mul_hi_u32 s17, s12, s13
+; GFX9-NEXT: v_readfirstlane_b32 s14, v1
+; GFX9-NEXT: v_readfirstlane_b32 s15, v0
+; GFX9-NEXT: s_mul_i32 s16, s12, s14
+; GFX9-NEXT: s_mul_hi_u32 s18, s12, s15
+; GFX9-NEXT: s_mul_i32 s17, s13, s15
+; GFX9-NEXT: s_add_i32 s16, s18, s16
+; GFX9-NEXT: s_mul_i32 s19, s12, s15
+; GFX9-NEXT: s_add_i32 s16, s16, s17
+; GFX9-NEXT: s_mul_hi_u32 s18, s15, s19
+; GFX9-NEXT: s_mul_i32 s20, s15, s16
+; GFX9-NEXT: s_mul_hi_u32 s17, s15, s16
; GFX9-NEXT: s_add_u32 s18, s18, s20
; GFX9-NEXT: s_addc_u32 s17, 0, s17
-; GFX9-NEXT: s_mul_hi_u32 s20, s16, s19
-; GFX9-NEXT: s_mul_i32 s19, s16, s19
+; GFX9-NEXT: s_mul_hi_u32 s20, s14, s19
+; GFX9-NEXT: s_mul_i32 s19, s14, s19
; GFX9-NEXT: s_add_u32 s18, s18, s19
-; GFX9-NEXT: s_mul_hi_u32 s21, s16, s13
+; GFX9-NEXT: s_mul_hi_u32 s21, s14, s16
; GFX9-NEXT: s_addc_u32 s17, s17, s20
; GFX9-NEXT: s_addc_u32 s18, s21, 0
-; GFX9-NEXT: s_mul_i32 s13, s16, s13
-; GFX9-NEXT: s_add_u32 s13, s17, s13
+; GFX9-NEXT: s_mul_i32 s16, s14, s16
+; GFX9-NEXT: s_add_u32 s16, s17, s16
; GFX9-NEXT: s_addc_u32 s17, 0, s18
-; GFX9-NEXT: s_add_u32 s18, s12, s13
-; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GFX9-NEXT: s_addc_u32 s16, s16, s17
-; GFX9-NEXT: s_mul_i32 s12, s14, s16
-; GFX9-NEXT: s_mul_hi_u32 s13, s14, s18
-; GFX9-NEXT: s_add_i32 s12, s13, s12
-; GFX9-NEXT: s_mul_i32 s15, s15, s18
-; GFX9-NEXT: s_add_i32 s12, s12, s15
-; GFX9-NEXT: s_mul_i32 s14, s14, s18
-; GFX9-NEXT: s_mul_hi_u32 s15, s16, s14
-; GFX9-NEXT: s_mul_i32 s17, s16, s14
-; GFX9-NEXT: s_mul_i32 s20, s18, s12
-; GFX9-NEXT: s_mul_hi_u32 s14, s18, s14
-; GFX9-NEXT: s_mul_hi_u32 s19, s18, s12
-; GFX9-NEXT: s_add_u32 s14, s14, s20
+; GFX9-NEXT: s_add_u32 s15, s15, s16
+; GFX9-NEXT: s_addc_u32 s14, s14, s17
+; GFX9-NEXT: s_mul_i32 s16, s12, s14
+; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15
+; GFX9-NEXT: s_add_i32 s16, s17, s16
+; GFX9-NEXT: s_mul_i32 s13, s13, s15
+; GFX9-NEXT: s_add_i32 s16, s16, s13
+; GFX9-NEXT: s_mul_i32 s12, s12, s15
+; GFX9-NEXT: s_mul_hi_u32 s17, s14, s12
+; GFX9-NEXT: s_mul_i32 s18, s14, s12
+; GFX9-NEXT: s_mul_i32 s20, s15, s16
+; GFX9-NEXT: s_mul_hi_u32 s12, s15, s12
+; GFX9-NEXT: s_mul_hi_u32 s19, s15, s16
+; GFX9-NEXT: s_add_u32 s12, s12, s20
; GFX9-NEXT: s_addc_u32 s19, 0, s19
-; GFX9-NEXT: s_add_u32 s14, s14, s17
-; GFX9-NEXT: s_mul_hi_u32 s13, s16, s12
-; GFX9-NEXT: s_addc_u32 s14, s19, s15
+; GFX9-NEXT: s_add_u32 s12, s12, s18
+; GFX9-NEXT: s_mul_hi_u32 s13, s14, s16
+; GFX9-NEXT: s_addc_u32 s12, s19, s17
; GFX9-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-NEXT: s_mul_i32 s12, s16, s12
-; GFX9-NEXT: s_add_u32 s12, s14, s12
-; GFX9-NEXT: s_addc_u32 s14, 0, s13
-; GFX9-NEXT: s_add_u32 s15, s18, s12
-; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GFX9-NEXT: s_addc_u32 s14, s16, s14
+; GFX9-NEXT: s_mul_i32 s16, s14, s16
+; GFX9-NEXT: s_add_u32 s12, s12, s16
+; GFX9-NEXT: s_addc_u32 s13, 0, s13
+; GFX9-NEXT: s_add_u32 s15, s15, s12
+; GFX9-NEXT: s_addc_u32 s14, s14, s13
; GFX9-NEXT: s_ashr_i32 s12, s9, 31
; GFX9-NEXT: s_add_u32 s8, s8, s12
; GFX9-NEXT: s_mov_b32 s13, s12
@@ -8701,38 +8668,35 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_addc_u32 s15, s16, s19
; GFX9-NEXT: s_addc_u32 s16, s18, 0
; GFX9-NEXT: s_mul_i32 s14, s9, s14
-; GFX9-NEXT: s_add_u32 s18, s15, s14
-; GFX9-NEXT: s_addc_u32 s19, 0, s16
-; GFX9-NEXT: s_mul_i32 s14, s6, s19
-; GFX9-NEXT: s_mul_hi_u32 s15, s6, s18
+; GFX9-NEXT: s_add_u32 s17, s15, s14
+; GFX9-NEXT: s_addc_u32 s16, 0, s16
+; GFX9-NEXT: s_mul_i32 s14, s6, s16
+; GFX9-NEXT: s_mul_hi_u32 s15, s6, s17
; GFX9-NEXT: s_add_i32 s14, s15, s14
-; GFX9-NEXT: s_mul_i32 s15, s7, s18
-; GFX9-NEXT: s_add_i32 s20, s14, s15
-; GFX9-NEXT: s_sub_i32 s16, s9, s20
-; GFX9-NEXT: s_mul_i32 s14, s6, s18
+; GFX9-NEXT: s_mul_i32 s15, s7, s17
+; GFX9-NEXT: s_add_i32 s18, s14, s15
+; GFX9-NEXT: s_sub_i32 s19, s9, s18
+; GFX9-NEXT: s_mul_i32 s14, s6, s17
; GFX9-NEXT: s_sub_u32 s8, s8, s14
; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT: s_subb_u32 s21, s16, s7
-; GFX9-NEXT: s_sub_u32 s22, s8, s6
-; GFX9-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[16:17], 0
-; GFX9-NEXT: s_subb_u32 s16, s21, 0
-; GFX9-NEXT: s_cmp_ge_u32 s16, s7
-; GFX9-NEXT: s_cselect_b32 s17, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s22, s6
+; GFX9-NEXT: s_subb_u32 s19, s19, s7
+; GFX9-NEXT: s_sub_u32 s20, s8, s6
+; GFX9-NEXT: s_subb_u32 s19, s19, 0
+; GFX9-NEXT: s_cmp_ge_u32 s19, s7
; GFX9-NEXT: s_cselect_b32 s21, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s16, s7
-; GFX9-NEXT: s_cselect_b32 s16, s21, s17
-; GFX9-NEXT: s_add_u32 s17, s18, 1
-; GFX9-NEXT: s_addc_u32 s21, s19, 0
-; GFX9-NEXT: s_add_u32 s22, s18, 2
-; GFX9-NEXT: s_addc_u32 s23, s19, 0
-; GFX9-NEXT: s_cmp_lg_u32 s16, 0
-; GFX9-NEXT: s_cselect_b32 s16, s22, s17
-; GFX9-NEXT: s_cselect_b32 s17, s23, s21
+; GFX9-NEXT: s_cmp_ge_u32 s20, s6
+; GFX9-NEXT: s_cselect_b32 s20, -1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s19, s7
+; GFX9-NEXT: s_cselect_b32 s19, s20, s21
+; GFX9-NEXT: s_add_u32 s20, s17, 1
+; GFX9-NEXT: s_addc_u32 s21, s16, 0
+; GFX9-NEXT: s_add_u32 s22, s17, 2
+; GFX9-NEXT: s_addc_u32 s23, s16, 0
+; GFX9-NEXT: s_cmp_lg_u32 s19, 0
+; GFX9-NEXT: s_cselect_b32 s19, s22, s20
+; GFX9-NEXT: s_cselect_b32 s20, s23, s21
; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT: s_subb_u32 s9, s9, s20
+; GFX9-NEXT: s_subb_u32 s9, s9, s18
; GFX9-NEXT: s_cmp_ge_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s14, -1, 0
; GFX9-NEXT: s_cmp_ge_u32 s8, s6
@@ -8740,12 +8704,12 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s6, s6, s14
; GFX9-NEXT: s_cmp_lg_u32 s6, 0
-; GFX9-NEXT: s_cselect_b32 s7, s17, s19
-; GFX9-NEXT: s_cselect_b32 s6, s16, s18
+; GFX9-NEXT: s_cselect_b32 s7, s20, s16
+; GFX9-NEXT: s_cselect_b32 s6, s19, s17
; GFX9-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3]
; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
-; GFX9-NEXT: s_sub_u32 s14, s6, s2
-; GFX9-NEXT: s_subb_u32 s15, s7, s3
+; GFX9-NEXT: s_sub_u32 s12, s6, s2
+; GFX9-NEXT: s_subb_u32 s13, s7, s3
; GFX9-NEXT: s_ashr_i32 s2, s1, 31
; GFX9-NEXT: s_add_u32 s0, s0, s2
; GFX9-NEXT: s_mov_b32 s3, s2
@@ -8754,8 +8718,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s8, 0, s6
-; GFX9-NEXT: s_subb_u32 s9, 0, s7
+; GFX9-NEXT: s_sub_u32 s4, 0, s6
+; GFX9-NEXT: s_subb_u32 s5, 0, s7
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -8765,105 +8729,98 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: v_readfirstlane_b32 s13, v2
-; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4
-; GFX9-NEXT: s_mul_i32 s16, s8, s13
-; GFX9-NEXT: s_mul_i32 s5, s9, s4
-; GFX9-NEXT: s_add_i32 s12, s12, s16
-; GFX9-NEXT: s_add_i32 s12, s12, s5
-; GFX9-NEXT: s_mul_i32 s17, s8, s4
-; GFX9-NEXT: s_mul_i32 s16, s4, s12
-; GFX9-NEXT: s_mul_hi_u32 s18, s4, s17
-; GFX9-NEXT: s_mul_hi_u32 s5, s4, s12
+; GFX9-NEXT: v_readfirstlane_b32 s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s15, v2
+; GFX9-NEXT: s_mul_hi_u32 s14, s4, s8
+; GFX9-NEXT: s_mul_i32 s16, s4, s15
+; GFX9-NEXT: s_mul_i32 s9, s5, s8
+; GFX9-NEXT: s_add_i32 s14, s14, s16
+; GFX9-NEXT: s_add_i32 s14, s14, s9
+; GFX9-NEXT: s_mul_i32 s17, s4, s8
+; GFX9-NEXT: s_mul_i32 s16, s8, s14
+; GFX9-NEXT: s_mul_hi_u32 s18, s8, s17
+; GFX9-NEXT: s_mul_hi_u32 s9, s8, s14
; GFX9-NEXT: s_add_u32 s16, s18, s16
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_mul_hi_u32 s19, s13, s17
-; GFX9-NEXT: s_mul_i32 s17, s13, s17
+; GFX9-NEXT: s_addc_u32 s9, 0, s9
+; GFX9-NEXT: s_mul_hi_u32 s19, s15, s17
+; GFX9-NEXT: s_mul_i32 s17, s15, s17
; GFX9-NEXT: s_add_u32 s16, s16, s17
-; GFX9-NEXT: s_mul_hi_u32 s18, s13, s12
-; GFX9-NEXT: s_addc_u32 s5, s5, s19
+; GFX9-NEXT: s_mul_hi_u32 s18, s15, s14
+; GFX9-NEXT: s_addc_u32 s9, s9, s19
; GFX9-NEXT: s_addc_u32 s16, s18, 0
-; GFX9-NEXT: s_mul_i32 s12, s13, s12
-; GFX9-NEXT: s_add_u32 s5, s5, s12
-; GFX9-NEXT: s_addc_u32 s12, 0, s16
-; GFX9-NEXT: s_add_u32 s16, s4, s5
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s12, s13, s12
-; GFX9-NEXT: s_mul_i32 s4, s8, s12
-; GFX9-NEXT: s_mul_hi_u32 s5, s8, s16
-; GFX9-NEXT: s_add_i32 s4, s5, s4
-; GFX9-NEXT: s_mul_i32 s9, s9, s16
-; GFX9-NEXT: s_add_i32 s4, s4, s9
-; GFX9-NEXT: s_mul_i32 s8, s8, s16
-; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8
-; GFX9-NEXT: s_mul_i32 s13, s12, s8
-; GFX9-NEXT: s_mul_i32 s18, s16, s4
-; GFX9-NEXT: s_mul_hi_u32 s8, s16, s8
-; GFX9-NEXT: s_mul_hi_u32 s17, s16, s4
-; GFX9-NEXT: s_add_u32 s8, s8, s18
+; GFX9-NEXT: s_mul_i32 s14, s15, s14
+; GFX9-NEXT: s_add_u32 s9, s9, s14
+; GFX9-NEXT: s_addc_u32 s14, 0, s16
+; GFX9-NEXT: s_add_u32 s8, s8, s9
+; GFX9-NEXT: s_addc_u32 s9, s15, s14
+; GFX9-NEXT: s_mul_i32 s14, s4, s9
+; GFX9-NEXT: s_mul_hi_u32 s15, s4, s8
+; GFX9-NEXT: s_add_i32 s14, s15, s14
+; GFX9-NEXT: s_mul_i32 s5, s5, s8
+; GFX9-NEXT: s_add_i32 s14, s14, s5
+; GFX9-NEXT: s_mul_i32 s4, s4, s8
+; GFX9-NEXT: s_mul_hi_u32 s15, s9, s4
+; GFX9-NEXT: s_mul_i32 s16, s9, s4
+; GFX9-NEXT: s_mul_i32 s18, s8, s14
+; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4
+; GFX9-NEXT: s_mul_hi_u32 s17, s8, s14
+; GFX9-NEXT: s_add_u32 s4, s4, s18
; GFX9-NEXT: s_addc_u32 s17, 0, s17
-; GFX9-NEXT: s_add_u32 s8, s8, s13
-; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4
-; GFX9-NEXT: s_addc_u32 s8, s17, s9
+; GFX9-NEXT: s_add_u32 s4, s4, s16
+; GFX9-NEXT: s_mul_hi_u32 s5, s9, s14
+; GFX9-NEXT: s_addc_u32 s4, s17, s15
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s4, s12, s4
-; GFX9-NEXT: s_add_u32 s4, s8, s4
-; GFX9-NEXT: s_addc_u32 s8, 0, s5
-; GFX9-NEXT: s_add_u32 s13, s16, s4
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s12, s12, s8
+; GFX9-NEXT: s_mul_i32 s14, s9, s14
+; GFX9-NEXT: s_add_u32 s4, s4, s14
+; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_add_u32 s14, s8, s4
+; GFX9-NEXT: s_addc_u32 s15, s9, s5
; GFX9-NEXT: s_ashr_i32 s4, s11, 31
; GFX9-NEXT: s_add_u32 s8, s10, s4
; GFX9-NEXT: s_mov_b32 s5, s4
; GFX9-NEXT: s_addc_u32 s9, s11, s4
; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5]
-; GFX9-NEXT: s_mul_i32 s11, s8, s12
-; GFX9-NEXT: s_mul_hi_u32 s16, s8, s13
-; GFX9-NEXT: s_mul_hi_u32 s10, s8, s12
+; GFX9-NEXT: s_mul_i32 s11, s8, s15
+; GFX9-NEXT: s_mul_hi_u32 s16, s8, s14
+; GFX9-NEXT: s_mul_hi_u32 s10, s8, s15
; GFX9-NEXT: s_add_u32 s11, s16, s11
; GFX9-NEXT: s_addc_u32 s10, 0, s10
-; GFX9-NEXT: s_mul_hi_u32 s17, s9, s13
-; GFX9-NEXT: s_mul_i32 s13, s9, s13
-; GFX9-NEXT: s_add_u32 s11, s11, s13
-; GFX9-NEXT: s_mul_hi_u32 s16, s9, s12
+; GFX9-NEXT: s_mul_hi_u32 s17, s9, s14
+; GFX9-NEXT: s_mul_i32 s14, s9, s14
+; GFX9-NEXT: s_add_u32 s11, s11, s14
+; GFX9-NEXT: s_mul_hi_u32 s16, s9, s15
; GFX9-NEXT: s_addc_u32 s10, s10, s17
; GFX9-NEXT: s_addc_u32 s11, s16, 0
-; GFX9-NEXT: s_mul_i32 s12, s9, s12
-; GFX9-NEXT: s_add_u32 s16, s10, s12
-; GFX9-NEXT: s_addc_u32 s17, 0, s11
-; GFX9-NEXT: s_mul_i32 s10, s6, s17
-; GFX9-NEXT: s_mul_hi_u32 s11, s6, s16
+; GFX9-NEXT: s_mul_i32 s14, s9, s15
+; GFX9-NEXT: s_add_u32 s14, s10, s14
+; GFX9-NEXT: s_addc_u32 s15, 0, s11
+; GFX9-NEXT: s_mul_i32 s10, s6, s15
+; GFX9-NEXT: s_mul_hi_u32 s11, s6, s14
; GFX9-NEXT: s_add_i32 s10, s11, s10
-; GFX9-NEXT: s_mul_i32 s11, s7, s16
-; GFX9-NEXT: s_add_i32 s18, s10, s11
-; GFX9-NEXT: s_sub_i32 s12, s9, s18
-; GFX9-NEXT: s_mul_i32 s10, s6, s16
+; GFX9-NEXT: s_mul_i32 s11, s7, s14
+; GFX9-NEXT: s_add_i32 s16, s10, s11
+; GFX9-NEXT: s_sub_i32 s17, s9, s16
+; GFX9-NEXT: s_mul_i32 s10, s6, s14
; GFX9-NEXT: s_sub_u32 s8, s8, s10
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s19, s12, s7
-; GFX9-NEXT: s_sub_u32 s20, s8, s6
-; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GFX9-NEXT: s_subb_u32 s12, s19, 0
-; GFX9-NEXT: s_cmp_ge_u32 s12, s7
-; GFX9-NEXT: s_cselect_b32 s13, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s20, s6
+; GFX9-NEXT: s_subb_u32 s17, s17, s7
+; GFX9-NEXT: s_sub_u32 s18, s8, s6
+; GFX9-NEXT: s_subb_u32 s17, s17, 0
+; GFX9-NEXT: s_cmp_ge_u32 s17, s7
; GFX9-NEXT: s_cselect_b32 s19, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s12, s7
-; GFX9-NEXT: s_cselect_b32 s12, s19, s13
-; GFX9-NEXT: s_add_u32 s13, s16, 1
-; GFX9-NEXT: s_addc_u32 s19, s17, 0
-; GFX9-NEXT: s_add_u32 s20, s16, 2
-; GFX9-NEXT: s_addc_u32 s21, s17, 0
-; GFX9-NEXT: s_cmp_lg_u32 s12, 0
-; GFX9-NEXT: s_cselect_b32 s12, s20, s13
-; GFX9-NEXT: s_cselect_b32 s13, s21, s19
+; GFX9-NEXT: s_cmp_ge_u32 s18, s6
+; GFX9-NEXT: s_cselect_b32 s18, -1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s17, s7
+; GFX9-NEXT: s_cselect_b32 s17, s18, s19
+; GFX9-NEXT: s_add_u32 s18, s14, 1
+; GFX9-NEXT: s_addc_u32 s19, s15, 0
+; GFX9-NEXT: s_add_u32 s20, s14, 2
+; GFX9-NEXT: s_addc_u32 s21, s15, 0
+; GFX9-NEXT: s_cmp_lg_u32 s17, 0
+; GFX9-NEXT: s_cselect_b32 s17, s20, s18
+; GFX9-NEXT: s_cselect_b32 s18, s21, s19
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s9, s9, s18
+; GFX9-NEXT: s_subb_u32 s9, s9, s16
; GFX9-NEXT: s_cmp_ge_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s10, -1, 0
; GFX9-NEXT: s_cmp_ge_u32 s8, s6
@@ -8871,14 +8828,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s6, s6, s10
; GFX9-NEXT: s_cmp_lg_u32 s6, 0
-; GFX9-NEXT: s_cselect_b32 s7, s13, s17
-; GFX9-NEXT: s_cselect_b32 s6, s12, s16
+; GFX9-NEXT: s_cselect_b32 s7, s18, s15
+; GFX9-NEXT: s_cselect_b32 s6, s17, s14
; GFX9-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], s[2:3]
; GFX9-NEXT: s_sub_u32 s2, s4, s2
; GFX9-NEXT: s_subb_u32 s3, s5, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s14
-; GFX9-NEXT: v_mov_b32_e32 v2, s15
+; GFX9-NEXT: v_mov_b32_e32 v1, s12
+; GFX9-NEXT: v_mov_b32_e32 v2, s13
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -9099,10 +9056,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_addc_u32 s13, 0, s14
; GFX6-NEXT: s_add_u32 s14, s0, s1
; GFX6-NEXT: v_mov_b32_e32 v0, s14
-; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0
+; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s12, s12, s13
; GFX6-NEXT: s_mul_i32 s0, s10, s12
; GFX6-NEXT: v_readfirstlane_b32 s1, v0
@@ -9133,7 +9089,6 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_add_u32 s13, s14, s0
; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s12, s12, s10
; GFX6-NEXT: s_ashr_i32 s10, s7, 31
; GFX6-NEXT: s_add_u32 s0, s6, s10
@@ -9168,46 +9123,43 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: v_readfirstlane_b32 s5, v0
; GFX6-NEXT: s_add_i32 s4, s5, s4
; GFX6-NEXT: s_mul_i32 s5, s9, s12
-; GFX6-NEXT: s_add_i32 s13, s4, s5
-; GFX6-NEXT: s_sub_i32 s14, s7, s13
+; GFX6-NEXT: s_add_i32 s14, s4, s5
+; GFX6-NEXT: s_sub_i32 s13, s7, s14
; GFX6-NEXT: s_mul_i32 s4, s8, s12
; GFX6-NEXT: s_sub_u32 s6, s6, s4
; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX6-NEXT: s_or_b32 s12, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s14, s14, s9
-; GFX6-NEXT: s_sub_u32 s15, s6, s8
-; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX6-NEXT: s_subb_u32 s15, s13, s9
+; GFX6-NEXT: s_sub_u32 s16, s6, s8
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s17, s12, s13
+; GFX6-NEXT: s_subb_u32 s17, s15, 0
+; GFX6-NEXT: s_cmp_ge_u32 s17, s9
+; GFX6-NEXT: s_cselect_b32 s18, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s16, s8
+; GFX6-NEXT: s_cselect_b32 s19, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s17, s9
+; GFX6-NEXT: s_cselect_b32 s18, s19, s18
+; GFX6-NEXT: s_or_b32 s12, s12, s13
+; GFX6-NEXT: s_subb_u32 s15, s15, s9
+; GFX6-NEXT: s_sub_u32 s19, s16, s8
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s12, s12, s13
+; GFX6-NEXT: s_subb_u32 s12, s15, 0
+; GFX6-NEXT: s_cmp_lg_u32 s18, 0
+; GFX6-NEXT: s_cselect_b32 s13, s19, s16
+; GFX6-NEXT: s_cselect_b32 s12, s12, s17
; GFX6-NEXT: s_or_b32 s4, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_subb_u32 s16, s14, 0
-; GFX6-NEXT: s_cmp_ge_u32 s16, s9
+; GFX6-NEXT: s_subb_u32 s4, s7, s14
+; GFX6-NEXT: s_cmp_ge_u32 s4, s9
; GFX6-NEXT: s_cselect_b32 s5, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s15, s8
-; GFX6-NEXT: s_cselect_b32 s17, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s16, s9
-; GFX6-NEXT: s_cselect_b32 s17, s17, s5
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_subb_u32 s14, s14, s9
-; GFX6-NEXT: s_sub_u32 s18, s15, s8
-; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX6-NEXT: s_or_b32 s4, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_subb_u32 s4, s14, 0
-; GFX6-NEXT: s_cmp_lg_u32 s17, 0
-; GFX6-NEXT: s_cselect_b32 s14, s18, s15
-; GFX6-NEXT: s_cselect_b32 s4, s4, s16
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s5, s7, s13
-; GFX6-NEXT: s_cmp_ge_u32 s5, s9
-; GFX6-NEXT: s_cselect_b32 s7, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s6, s8
-; GFX6-NEXT: s_cselect_b32 s8, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s5, s9
-; GFX6-NEXT: s_cselect_b32 s7, s8, s7
-; GFX6-NEXT: s_cmp_lg_u32 s7, 0
-; GFX6-NEXT: s_cselect_b32 s5, s4, s5
-; GFX6-NEXT: s_cselect_b32 s4, s14, s6
+; GFX6-NEXT: s_cselect_b32 s7, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s4, s9
+; GFX6-NEXT: s_cselect_b32 s5, s7, s5
+; GFX6-NEXT: s_cmp_lg_u32 s5, 0
+; GFX6-NEXT: s_cselect_b32 s5, s12, s4
+; GFX6-NEXT: s_cselect_b32 s4, s13, s6
; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11]
; GFX6-NEXT: s_sub_u32 s4, s4, s10
; GFX6-NEXT: s_subb_u32 s5, s5, s10
@@ -9229,8 +9181,8 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s8, 0, s6
-; GFX9-NEXT: s_subb_u32 s9, 0, s7
+; GFX9-NEXT: s_sub_u32 s4, 0, s6
+; GFX9-NEXT: s_subb_u32 s5, 0, s7
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -9240,56 +9192,52 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s10, v2
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_mul_i32 s5, s8, s10
-; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4
-; GFX9-NEXT: s_mul_i32 s11, s9, s4
-; GFX9-NEXT: s_add_i32 s5, s12, s5
-; GFX9-NEXT: s_mul_i32 s13, s8, s4
-; GFX9-NEXT: s_add_i32 s5, s5, s11
-; GFX9-NEXT: s_mul_hi_u32 s12, s4, s13
-; GFX9-NEXT: s_mul_i32 s14, s4, s5
-; GFX9-NEXT: s_mul_hi_u32 s11, s4, s5
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: v_readfirstlane_b32 s9, v1
+; GFX9-NEXT: s_mul_i32 s10, s4, s8
+; GFX9-NEXT: s_mul_hi_u32 s12, s4, s9
+; GFX9-NEXT: s_mul_i32 s11, s5, s9
+; GFX9-NEXT: s_add_i32 s10, s12, s10
+; GFX9-NEXT: s_mul_i32 s13, s4, s9
+; GFX9-NEXT: s_add_i32 s10, s10, s11
+; GFX9-NEXT: s_mul_hi_u32 s12, s9, s13
+; GFX9-NEXT: s_mul_i32 s14, s9, s10
+; GFX9-NEXT: s_mul_hi_u32 s11, s9, s10
; GFX9-NEXT: s_add_u32 s12, s12, s14
; GFX9-NEXT: s_addc_u32 s11, 0, s11
-; GFX9-NEXT: s_mul_hi_u32 s15, s10, s13
-; GFX9-NEXT: s_mul_i32 s13, s10, s13
+; GFX9-NEXT: s_mul_hi_u32 s15, s8, s13
+; GFX9-NEXT: s_mul_i32 s13, s8, s13
; GFX9-NEXT: s_add_u32 s12, s12, s13
-; GFX9-NEXT: s_mul_hi_u32 s14, s10, s5
+; GFX9-NEXT: s_mul_hi_u32 s14, s8, s10
; GFX9-NEXT: s_addc_u32 s11, s11, s15
; GFX9-NEXT: s_addc_u32 s12, s14, 0
-; GFX9-NEXT: s_mul_i32 s5, s10, s5
-; GFX9-NEXT: s_add_u32 s5, s11, s5
+; GFX9-NEXT: s_mul_i32 s10, s8, s10
+; GFX9-NEXT: s_add_u32 s10, s11, s10
; GFX9-NEXT: s_addc_u32 s11, 0, s12
-; GFX9-NEXT: s_add_u32 s12, s4, s5
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s10, s10, s11
-; GFX9-NEXT: s_mul_i32 s4, s8, s10
-; GFX9-NEXT: s_mul_hi_u32 s5, s8, s12
-; GFX9-NEXT: s_add_i32 s4, s5, s4
-; GFX9-NEXT: s_mul_i32 s9, s9, s12
-; GFX9-NEXT: s_add_i32 s4, s4, s9
-; GFX9-NEXT: s_mul_i32 s8, s8, s12
-; GFX9-NEXT: s_mul_hi_u32 s9, s10, s8
-; GFX9-NEXT: s_mul_i32 s11, s10, s8
-; GFX9-NEXT: s_mul_i32 s14, s12, s4
-; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8
-; GFX9-NEXT: s_mul_hi_u32 s13, s12, s4
-; GFX9-NEXT: s_add_u32 s8, s8, s14
+; GFX9-NEXT: s_add_u32 s9, s9, s10
+; GFX9-NEXT: s_addc_u32 s8, s8, s11
+; GFX9-NEXT: s_mul_i32 s10, s4, s8
+; GFX9-NEXT: s_mul_hi_u32 s11, s4, s9
+; GFX9-NEXT: s_add_i32 s10, s11, s10
+; GFX9-NEXT: s_mul_i32 s5, s5, s9
+; GFX9-NEXT: s_add_i32 s10, s10, s5
+; GFX9-NEXT: s_mul_i32 s4, s4, s9
+; GFX9-NEXT: s_mul_hi_u32 s11, s8, s4
+; GFX9-NEXT: s_mul_i32 s12, s8, s4
+; GFX9-NEXT: s_mul_i32 s14, s9, s10
+; GFX9-NEXT: s_mul_hi_u32 s4, s9, s4
+; GFX9-NEXT: s_mul_hi_u32 s13, s9, s10
+; GFX9-NEXT: s_add_u32 s4, s4, s14
; GFX9-NEXT: s_addc_u32 s13, 0, s13
-; GFX9-NEXT: s_add_u32 s8, s8, s11
-; GFX9-NEXT: s_mul_hi_u32 s5, s10, s4
-; GFX9-NEXT: s_addc_u32 s8, s13, s9
+; GFX9-NEXT: s_add_u32 s4, s4, s12
+; GFX9-NEXT: s_mul_hi_u32 s5, s8, s10
+; GFX9-NEXT: s_addc_u32 s4, s13, s11
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s4, s10, s4
-; GFX9-NEXT: s_add_u32 s4, s8, s4
-; GFX9-NEXT: s_addc_u32 s8, 0, s5
-; GFX9-NEXT: s_add_u32 s9, s12, s4
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s8, s10, s8
+; GFX9-NEXT: s_mul_i32 s10, s8, s10
+; GFX9-NEXT: s_add_u32 s4, s4, s10
+; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_add_u32 s9, s9, s4
+; GFX9-NEXT: s_addc_u32 s8, s8, s5
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: s_add_u32 s2, s2, s4
@@ -9319,11 +9267,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_mul_i32 s8, s6, s8
; GFX9-NEXT: s_sub_u32 s2, s2, s8
; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_subb_u32 s13, s10, s7
; GFX9-NEXT: s_sub_u32 s14, s2, s6
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
; GFX9-NEXT: s_subb_u32 s15, s13, 0
; GFX9-NEXT: s_cmp_ge_u32 s15, s7
; GFX9-NEXT: s_cselect_b32 s16, -1, 0
@@ -9332,13 +9278,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_cmp_eq_u32 s15, s7
; GFX9-NEXT: s_cselect_b32 s16, s17, s16
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s13, s13, s7
-; GFX9-NEXT: s_sub_u32 s17, s14, s6
-; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s10, s13, 0
+; GFX9-NEXT: s_subb_u32 s10, s13, s7
+; GFX9-NEXT: s_sub_u32 s11, s14, s6
+; GFX9-NEXT: s_subb_u32 s10, s10, 0
; GFX9-NEXT: s_cmp_lg_u32 s16, 0
-; GFX9-NEXT: s_cselect_b32 s11, s17, s14
+; GFX9-NEXT: s_cselect_b32 s11, s11, s14
; GFX9-NEXT: s_cselect_b32 s10, s10, s15
; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_subb_u32 s3, s3, s12
@@ -9500,10 +9444,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_addc_u32 s15, 0, s16
; GFX6-NEXT: s_add_u32 s16, s6, s7
; GFX6-NEXT: v_mov_b32_e32 v0, s16
-; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX6-NEXT: s_or_b32 s6, s6, s7
-; GFX6-NEXT: s_cmp_lg_u32 s6, 0
; GFX6-NEXT: s_addc_u32 s14, s14, s15
; GFX6-NEXT: s_mul_i32 s6, s12, s14
; GFX6-NEXT: v_readfirstlane_b32 s7, v0
@@ -9534,7 +9477,6 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_add_u32 s13, s16, s6
; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX6-NEXT: s_or_b32 s6, s6, s7
-; GFX6-NEXT: s_cmp_lg_u32 s6, 0
; GFX6-NEXT: s_addc_u32 s12, s14, s12
; GFX6-NEXT: s_ashr_i32 s6, s9, 31
; GFX6-NEXT: s_add_u32 s8, s8, s6
@@ -9567,49 +9509,46 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_readfirstlane_b32 s14, v0
; GFX6-NEXT: s_add_i32 s13, s14, s13
; GFX6-NEXT: s_mul_i32 s14, s3, s12
-; GFX6-NEXT: s_add_i32 s14, s13, s14
-; GFX6-NEXT: s_sub_i32 s15, s9, s14
+; GFX6-NEXT: s_add_i32 s16, s13, s14
+; GFX6-NEXT: s_sub_i32 s14, s9, s16
; GFX6-NEXT: s_mul_i32 s12, s2, s12
; GFX6-NEXT: s_sub_u32 s8, s8, s12
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s16, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_subb_u32 s15, s15, s3
-; GFX6-NEXT: s_sub_u32 s17, s8, s2
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s18, s15, 0
-; GFX6-NEXT: s_cmp_ge_u32 s18, s3
-; GFX6-NEXT: s_cselect_b32 s13, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s17, s2
-; GFX6-NEXT: s_cselect_b32 s19, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s18, s3
-; GFX6-NEXT: s_cselect_b32 s19, s19, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s15, s15, s3
-; GFX6-NEXT: s_sub_u32 s20, s17, s2
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s15, s12, s13
+; GFX6-NEXT: s_subb_u32 s17, s14, s3
+; GFX6-NEXT: s_sub_u32 s18, s8, s2
+; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT: s_or_b32 s19, s14, s15
+; GFX6-NEXT: s_subb_u32 s19, s17, 0
+; GFX6-NEXT: s_cmp_ge_u32 s19, s3
+; GFX6-NEXT: s_cselect_b32 s20, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s18, s2
+; GFX6-NEXT: s_cselect_b32 s21, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s19, s3
+; GFX6-NEXT: s_cselect_b32 s20, s21, s20
+; GFX6-NEXT: s_or_b32 s14, s14, s15
+; GFX6-NEXT: s_subb_u32 s17, s17, s3
+; GFX6-NEXT: s_sub_u32 s21, s18, s2
+; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT: s_or_b32 s14, s14, s15
+; GFX6-NEXT: s_subb_u32 s14, s17, 0
+; GFX6-NEXT: s_cmp_lg_u32 s20, 0
+; GFX6-NEXT: s_cselect_b32 s15, s21, s18
+; GFX6-NEXT: s_cselect_b32 s14, s14, s19
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s12, s15, 0
-; GFX6-NEXT: s_cmp_lg_u32 s19, 0
-; GFX6-NEXT: s_cselect_b32 s13, s20, s17
-; GFX6-NEXT: s_cselect_b32 s12, s12, s18
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_subb_u32 s9, s9, s14
+; GFX6-NEXT: s_subb_u32 s9, s9, s16
; GFX6-NEXT: s_cmp_ge_u32 s9, s3
-; GFX6-NEXT: s_cselect_b32 s14, -1, 0
+; GFX6-NEXT: s_cselect_b32 s12, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s8, s2
; GFX6-NEXT: s_cselect_b32 s2, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s9, s3
-; GFX6-NEXT: s_cselect_b32 s2, s2, s14
+; GFX6-NEXT: s_cselect_b32 s2, s2, s12
; GFX6-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6-NEXT: s_cselect_b32 s3, s12, s9
-; GFX6-NEXT: s_cselect_b32 s2, s13, s8
+; GFX6-NEXT: s_cselect_b32 s3, s14, s9
+; GFX6-NEXT: s_cselect_b32 s2, s15, s8
; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7]
-; GFX6-NEXT: s_sub_u32 s12, s2, s6
-; GFX6-NEXT: s_subb_u32 s13, s3, s6
+; GFX6-NEXT: s_sub_u32 s14, s2, s6
+; GFX6-NEXT: s_subb_u32 s15, s3, s6
; GFX6-NEXT: s_ashr_i32 s2, s1, 31
; GFX6-NEXT: s_add_u32 s0, s0, s2
; GFX6-NEXT: s_mov_b32 s3, s2
@@ -9628,40 +9567,39 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0
-; GFX6-NEXT: v_readfirstlane_b32 s14, v1
+; GFX6-NEXT: v_readfirstlane_b32 s12, v1
; GFX6-NEXT: v_readfirstlane_b32 s2, v0
-; GFX6-NEXT: s_mul_i32 s1, s8, s14
+; GFX6-NEXT: s_mul_i32 s1, s8, s12
; GFX6-NEXT: v_readfirstlane_b32 s3, v2
; GFX6-NEXT: s_mul_i32 s0, s9, s2
; GFX6-NEXT: s_add_i32 s1, s3, s1
; GFX6-NEXT: s_add_i32 s3, s1, s0
-; GFX6-NEXT: s_mul_i32 s15, s8, s2
+; GFX6-NEXT: s_mul_i32 s13, s8, s2
; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3
-; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: s_mul_i32 s4, s2, s3
; GFX6-NEXT: v_readfirstlane_b32 s5, v2
; GFX6-NEXT: v_readfirstlane_b32 s16, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15
+; GFX6-NEXT: v_mul_hi_u32 v0, v1, s13
; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3
; GFX6-NEXT: s_add_u32 s4, s16, s4
; GFX6-NEXT: s_addc_u32 s5, 0, s5
-; GFX6-NEXT: s_mul_i32 s15, s14, s15
+; GFX6-NEXT: s_mul_i32 s13, s12, s13
; GFX6-NEXT: v_readfirstlane_b32 s16, v0
-; GFX6-NEXT: s_add_u32 s4, s4, s15
+; GFX6-NEXT: s_add_u32 s4, s4, s13
; GFX6-NEXT: s_addc_u32 s4, s5, s16
; GFX6-NEXT: v_readfirstlane_b32 s5, v1
; GFX6-NEXT: s_addc_u32 s5, s5, 0
-; GFX6-NEXT: s_mul_i32 s3, s14, s3
+; GFX6-NEXT: s_mul_i32 s3, s12, s3
; GFX6-NEXT: s_add_u32 s3, s4, s3
; GFX6-NEXT: s_addc_u32 s4, 0, s5
; GFX6-NEXT: s_add_u32 s5, s2, s3
; GFX6-NEXT: v_mov_b32_e32 v0, s5
-; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
+; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6-NEXT: s_addc_u32 s4, s14, s4
+; GFX6-NEXT: s_addc_u32 s4, s12, s4
; GFX6-NEXT: s_mul_i32 s2, s8, s4
; GFX6-NEXT: v_readfirstlane_b32 s3, v0
; GFX6-NEXT: s_add_i32 s2, s3, s2
@@ -9675,102 +9613,98 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX6-NEXT: s_mul_i32 s9, s5, s2
-; GFX6-NEXT: v_readfirstlane_b32 s15, v2
-; GFX6-NEXT: s_add_u32 s9, s15, s9
-; GFX6-NEXT: v_readfirstlane_b32 s14, v0
+; GFX6-NEXT: v_readfirstlane_b32 s13, v2
+; GFX6-NEXT: s_add_u32 s9, s13, s9
+; GFX6-NEXT: v_readfirstlane_b32 s12, v0
; GFX6-NEXT: s_mul_i32 s3, s4, s3
-; GFX6-NEXT: s_addc_u32 s14, 0, s14
+; GFX6-NEXT: s_addc_u32 s12, 0, s12
; GFX6-NEXT: v_readfirstlane_b32 s8, v3
; GFX6-NEXT: s_add_u32 s3, s9, s3
-; GFX6-NEXT: s_addc_u32 s3, s14, s8
+; GFX6-NEXT: s_addc_u32 s3, s12, s8
; GFX6-NEXT: v_readfirstlane_b32 s8, v1
; GFX6-NEXT: s_addc_u32 s8, s8, 0
; GFX6-NEXT: s_mul_i32 s2, s4, s2
; GFX6-NEXT: s_add_u32 s2, s3, s2
; GFX6-NEXT: s_addc_u32 s8, 0, s8
-; GFX6-NEXT: s_add_u32 s14, s5, s2
+; GFX6-NEXT: s_add_u32 s12, s5, s2
; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6-NEXT: s_addc_u32 s15, s4, s8
+; GFX6-NEXT: s_addc_u32 s13, s4, s8
; GFX6-NEXT: s_ashr_i32 s4, s11, 31
; GFX6-NEXT: s_add_u32 s2, s10, s4
; GFX6-NEXT: s_mov_b32 s5, s4
; GFX6-NEXT: s_addc_u32 s3, s11, s4
; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v0, s15
+; GFX6-NEXT: v_mov_b32_e32 v0, s13
; GFX6-NEXT: v_mul_hi_u32 v1, s8, v0
-; GFX6-NEXT: v_mov_b32_e32 v2, s14
+; GFX6-NEXT: v_mov_b32_e32 v2, s12
; GFX6-NEXT: v_mul_hi_u32 v3, s8, v2
-; GFX6-NEXT: s_mul_i32 s2, s8, s15
+; GFX6-NEXT: s_mul_i32 s2, s8, s13
; GFX6-NEXT: v_readfirstlane_b32 s10, v1
; GFX6-NEXT: v_mul_hi_u32 v1, s9, v2
; GFX6-NEXT: v_readfirstlane_b32 s11, v3
; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0
; GFX6-NEXT: s_add_u32 s2, s11, s2
; GFX6-NEXT: s_addc_u32 s10, 0, s10
-; GFX6-NEXT: s_mul_i32 s11, s9, s14
-; GFX6-NEXT: v_readfirstlane_b32 s14, v1
+; GFX6-NEXT: s_mul_i32 s11, s9, s12
+; GFX6-NEXT: v_readfirstlane_b32 s12, v1
; GFX6-NEXT: s_add_u32 s2, s2, s11
-; GFX6-NEXT: s_addc_u32 s2, s10, s14
+; GFX6-NEXT: s_addc_u32 s2, s10, s12
; GFX6-NEXT: v_readfirstlane_b32 s10, v0
; GFX6-NEXT: s_addc_u32 s10, s10, 0
-; GFX6-NEXT: s_mul_i32 s11, s9, s15
+; GFX6-NEXT: s_mul_i32 s11, s9, s13
; GFX6-NEXT: s_add_u32 s11, s2, s11
; GFX6-NEXT: v_mov_b32_e32 v0, s11
; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
; GFX6-NEXT: s_addc_u32 s10, 0, s10
; GFX6-NEXT: s_mul_i32 s10, s6, s10
; GFX6-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NEXT: v_readfirstlane_b32 s14, v0
-; GFX6-NEXT: s_add_i32 s10, s14, s10
-; GFX6-NEXT: s_mul_i32 s14, s7, s11
-; GFX6-NEXT: s_add_i32 s14, s10, s14
-; GFX6-NEXT: s_sub_i32 s15, s9, s14
+; GFX6-NEXT: v_readfirstlane_b32 s12, v0
+; GFX6-NEXT: s_add_i32 s10, s12, s10
+; GFX6-NEXT: s_mul_i32 s12, s7, s11
+; GFX6-NEXT: s_add_i32 s16, s10, s12
+; GFX6-NEXT: s_sub_i32 s12, s9, s16
; GFX6-NEXT: s_mul_i32 s10, s6, s11
; GFX6-NEXT: s_sub_u32 s8, s8, s10
; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX6-NEXT: s_or_b32 s16, s10, s11
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_subb_u32 s15, s15, s7
-; GFX6-NEXT: s_sub_u32 s17, s8, s6
-; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX6-NEXT: s_or_b32 s10, s10, s11
-; GFX6-NEXT: s_cmp_lg_u32 s10, 0
-; GFX6-NEXT: s_subb_u32 s18, s15, 0
-; GFX6-NEXT: s_cmp_ge_u32 s18, s7
-; GFX6-NEXT: s_cselect_b32 s11, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s17, s6
-; GFX6-NEXT: s_cselect_b32 s19, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s18, s7
-; GFX6-NEXT: s_cselect_b32 s19, s19, s11
-; GFX6-NEXT: s_cmp_lg_u32 s10, 0
-; GFX6-NEXT: s_subb_u32 s15, s15, s7
-; GFX6-NEXT: s_sub_u32 s20, s17, s6
-; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GFX6-NEXT: s_or_b32 s13, s10, s11
+; GFX6-NEXT: s_subb_u32 s17, s12, s7
+; GFX6-NEXT: s_sub_u32 s18, s8, s6
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s19, s12, s13
+; GFX6-NEXT: s_subb_u32 s19, s17, 0
+; GFX6-NEXT: s_cmp_ge_u32 s19, s7
+; GFX6-NEXT: s_cselect_b32 s20, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s18, s6
+; GFX6-NEXT: s_cselect_b32 s21, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s19, s7
+; GFX6-NEXT: s_cselect_b32 s20, s21, s20
+; GFX6-NEXT: s_or_b32 s12, s12, s13
+; GFX6-NEXT: s_subb_u32 s17, s17, s7
+; GFX6-NEXT: s_sub_u32 s21, s18, s6
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s12, s12, s13
+; GFX6-NEXT: s_subb_u32 s12, s17, 0
+; GFX6-NEXT: s_cmp_lg_u32 s20, 0
+; GFX6-NEXT: s_cselect_b32 s13, s21, s18
+; GFX6-NEXT: s_cselect_b32 s12, s12, s19
; GFX6-NEXT: s_or_b32 s10, s10, s11
-; GFX6-NEXT: s_cmp_lg_u32 s10, 0
-; GFX6-NEXT: s_subb_u32 s10, s15, 0
-; GFX6-NEXT: s_cmp_lg_u32 s19, 0
-; GFX6-NEXT: s_cselect_b32 s11, s20, s17
-; GFX6-NEXT: s_cselect_b32 s10, s10, s18
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_subb_u32 s9, s9, s14
+; GFX6-NEXT: s_subb_u32 s9, s9, s16
; GFX6-NEXT: s_cmp_ge_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s14, -1, 0
+; GFX6-NEXT: s_cselect_b32 s10, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s8, s6
; GFX6-NEXT: s_cselect_b32 s6, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s6, s6, s14
+; GFX6-NEXT: s_cselect_b32 s6, s6, s10
; GFX6-NEXT: s_cmp_lg_u32 s6, 0
-; GFX6-NEXT: s_cselect_b32 s7, s10, s9
-; GFX6-NEXT: s_cselect_b32 s6, s11, s8
+; GFX6-NEXT: s_cselect_b32 s7, s12, s9
+; GFX6-NEXT: s_cselect_b32 s6, s13, s8
; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
; GFX6-NEXT: s_sub_u32 s5, s6, s4
; GFX6-NEXT: s_subb_u32 s4, s7, s4
; GFX6-NEXT: s_mov_b32 s2, -1
-; GFX6-NEXT: v_mov_b32_e32 v0, s12
-; GFX6-NEXT: v_mov_b32_e32 v1, s13
+; GFX6-NEXT: v_mov_b32_e32 v0, s14
+; GFX6-NEXT: v_mov_b32_e32 v1, s15
; GFX6-NEXT: v_mov_b32_e32 v2, s5
; GFX6-NEXT: v_mov_b32_e32 v3, s4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -9790,8 +9724,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7]
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
-; GFX9-NEXT: s_sub_u32 s12, 0, s2
-; GFX9-NEXT: s_subb_u32 s13, 0, s3
+; GFX9-NEXT: s_sub_u32 s6, 0, s2
+; GFX9-NEXT: s_subb_u32 s7, 0, s3
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -9800,56 +9734,52 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s14, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v0
-; GFX9-NEXT: s_mul_i32 s7, s12, s14
-; GFX9-NEXT: s_mul_hi_u32 s16, s12, s6
-; GFX9-NEXT: s_mul_i32 s15, s13, s6
-; GFX9-NEXT: s_add_i32 s7, s16, s7
-; GFX9-NEXT: s_mul_i32 s17, s12, s6
-; GFX9-NEXT: s_add_i32 s7, s7, s15
-; GFX9-NEXT: s_mul_hi_u32 s16, s6, s17
-; GFX9-NEXT: s_mul_i32 s18, s6, s7
-; GFX9-NEXT: s_mul_hi_u32 s15, s6, s7
+; GFX9-NEXT: v_readfirstlane_b32 s12, v1
+; GFX9-NEXT: v_readfirstlane_b32 s13, v0
+; GFX9-NEXT: s_mul_i32 s14, s6, s12
+; GFX9-NEXT: s_mul_hi_u32 s16, s6, s13
+; GFX9-NEXT: s_mul_i32 s15, s7, s13
+; GFX9-NEXT: s_add_i32 s14, s16, s14
+; GFX9-NEXT: s_mul_i32 s17, s6, s13
+; GFX9-NEXT: s_add_i32 s14, s14, s15
+; GFX9-NEXT: s_mul_hi_u32 s16, s13, s17
+; GFX9-NEXT: s_mul_i32 s18, s13, s14
+; GFX9-NEXT: s_mul_hi_u32 s15, s13, s14
; GFX9-NEXT: s_add_u32 s16, s16, s18
; GFX9-NEXT: s_addc_u32 s15, 0, s15
-; GFX9-NEXT: s_mul_hi_u32 s18, s14, s17
-; GFX9-NEXT: s_mul_i32 s17, s14, s17
+; GFX9-NEXT: s_mul_hi_u32 s18, s12, s17
+; GFX9-NEXT: s_mul_i32 s17, s12, s17
; GFX9-NEXT: s_add_u32 s16, s16, s17
-; GFX9-NEXT: s_mul_hi_u32 s19, s14, s7
+; GFX9-NEXT: s_mul_hi_u32 s19, s12, s14
; GFX9-NEXT: s_addc_u32 s15, s15, s18
; GFX9-NEXT: s_addc_u32 s16, s19, 0
-; GFX9-NEXT: s_mul_i32 s7, s14, s7
-; GFX9-NEXT: s_add_u32 s7, s15, s7
+; GFX9-NEXT: s_mul_i32 s14, s12, s14
+; GFX9-NEXT: s_add_u32 s14, s15, s14
; GFX9-NEXT: s_addc_u32 s15, 0, s16
-; GFX9-NEXT: s_add_u32 s16, s6, s7
-; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX9-NEXT: s_addc_u32 s14, s14, s15
-; GFX9-NEXT: s_mul_i32 s6, s12, s14
-; GFX9-NEXT: s_mul_hi_u32 s7, s12, s16
-; GFX9-NEXT: s_add_i32 s6, s7, s6
-; GFX9-NEXT: s_mul_i32 s13, s13, s16
-; GFX9-NEXT: s_add_i32 s6, s6, s13
-; GFX9-NEXT: s_mul_i32 s12, s12, s16
-; GFX9-NEXT: s_mul_hi_u32 s13, s14, s12
-; GFX9-NEXT: s_mul_i32 s15, s14, s12
-; GFX9-NEXT: s_mul_i32 s18, s16, s6
-; GFX9-NEXT: s_mul_hi_u32 s12, s16, s12
-; GFX9-NEXT: s_mul_hi_u32 s17, s16, s6
-; GFX9-NEXT: s_add_u32 s12, s12, s18
+; GFX9-NEXT: s_add_u32 s13, s13, s14
+; GFX9-NEXT: s_addc_u32 s12, s12, s15
+; GFX9-NEXT: s_mul_i32 s14, s6, s12
+; GFX9-NEXT: s_mul_hi_u32 s15, s6, s13
+; GFX9-NEXT: s_add_i32 s14, s15, s14
+; GFX9-NEXT: s_mul_i32 s7, s7, s13
+; GFX9-NEXT: s_add_i32 s14, s14, s7
+; GFX9-NEXT: s_mul_i32 s6, s6, s13
+; GFX9-NEXT: s_mul_hi_u32 s15, s12, s6
+; GFX9-NEXT: s_mul_i32 s16, s12, s6
+; GFX9-NEXT: s_mul_i32 s18, s13, s14
+; GFX9-NEXT: s_mul_hi_u32 s6, s13, s6
+; GFX9-NEXT: s_mul_hi_u32 s17, s13, s14
+; GFX9-NEXT: s_add_u32 s6, s6, s18
; GFX9-NEXT: s_addc_u32 s17, 0, s17
-; GFX9-NEXT: s_add_u32 s12, s12, s15
-; GFX9-NEXT: s_mul_hi_u32 s7, s14, s6
-; GFX9-NEXT: s_addc_u32 s12, s17, s13
+; GFX9-NEXT: s_add_u32 s6, s6, s16
+; GFX9-NEXT: s_mul_hi_u32 s7, s12, s14
+; GFX9-NEXT: s_addc_u32 s6, s17, s15
; GFX9-NEXT: s_addc_u32 s7, s7, 0
-; GFX9-NEXT: s_mul_i32 s6, s14, s6
-; GFX9-NEXT: s_add_u32 s6, s12, s6
-; GFX9-NEXT: s_addc_u32 s12, 0, s7
-; GFX9-NEXT: s_add_u32 s13, s16, s6
-; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX9-NEXT: s_addc_u32 s12, s14, s12
+; GFX9-NEXT: s_mul_i32 s14, s12, s14
+; GFX9-NEXT: s_add_u32 s6, s6, s14
+; GFX9-NEXT: s_addc_u32 s7, 0, s7
+; GFX9-NEXT: s_add_u32 s13, s13, s6
+; GFX9-NEXT: s_addc_u32 s12, s12, s7
; GFX9-NEXT: s_ashr_i32 s6, s9, 31
; GFX9-NEXT: s_add_u32 s8, s8, s6
; GFX9-NEXT: s_mov_b32 s7, s6
@@ -9878,11 +9808,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_mul_i32 s12, s2, s12
; GFX9-NEXT: s_sub_u32 s8, s8, s12
; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
; GFX9-NEXT: s_subb_u32 s17, s14, s3
; GFX9-NEXT: s_sub_u32 s18, s8, s2
; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
; GFX9-NEXT: s_subb_u32 s19, s17, 0
; GFX9-NEXT: s_cmp_ge_u32 s19, s3
; GFX9-NEXT: s_cselect_b32 s20, -1, 0
@@ -9891,13 +9819,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s19, s3
; GFX9-NEXT: s_cselect_b32 s20, s21, s20
; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT: s_subb_u32 s17, s17, s3
-; GFX9-NEXT: s_sub_u32 s21, s18, s2
-; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT: s_subb_u32 s14, s17, 0
+; GFX9-NEXT: s_subb_u32 s14, s17, s3
+; GFX9-NEXT: s_sub_u32 s15, s18, s2
+; GFX9-NEXT: s_subb_u32 s14, s14, 0
; GFX9-NEXT: s_cmp_lg_u32 s20, 0
-; GFX9-NEXT: s_cselect_b32 s15, s21, s18
+; GFX9-NEXT: s_cselect_b32 s15, s15, s18
; GFX9-NEXT: s_cselect_b32 s14, s14, s19
; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
; GFX9-NEXT: s_subb_u32 s9, s9, s16
@@ -9921,8 +9847,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s6, 0, s2
-; GFX9-NEXT: s_subb_u32 s7, 0, s3
+; GFX9-NEXT: s_sub_u32 s4, 0, s2
+; GFX9-NEXT: s_subb_u32 s5, 0, s3
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -9932,74 +9858,70 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: v_readfirstlane_b32 s6, v1
; GFX9-NEXT: v_readfirstlane_b32 s9, v2
-; GFX9-NEXT: s_mul_hi_u32 s8, s6, s4
-; GFX9-NEXT: s_mul_i32 s14, s6, s9
-; GFX9-NEXT: s_mul_i32 s5, s7, s4
+; GFX9-NEXT: s_mul_hi_u32 s8, s4, s6
+; GFX9-NEXT: s_mul_i32 s14, s4, s9
+; GFX9-NEXT: s_mul_i32 s7, s5, s6
; GFX9-NEXT: s_add_i32 s8, s8, s14
-; GFX9-NEXT: s_add_i32 s8, s8, s5
-; GFX9-NEXT: s_mul_i32 s15, s6, s4
-; GFX9-NEXT: s_mul_i32 s14, s4, s8
-; GFX9-NEXT: s_mul_hi_u32 s16, s4, s15
-; GFX9-NEXT: s_mul_hi_u32 s5, s4, s8
+; GFX9-NEXT: s_add_i32 s8, s8, s7
+; GFX9-NEXT: s_mul_i32 s15, s4, s6
+; GFX9-NEXT: s_mul_i32 s14, s6, s8
+; GFX9-NEXT: s_mul_hi_u32 s16, s6, s15
+; GFX9-NEXT: s_mul_hi_u32 s7, s6, s8
; GFX9-NEXT: s_add_u32 s14, s16, s14
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_addc_u32 s7, 0, s7
; GFX9-NEXT: s_mul_hi_u32 s17, s9, s15
; GFX9-NEXT: s_mul_i32 s15, s9, s15
; GFX9-NEXT: s_add_u32 s14, s14, s15
; GFX9-NEXT: s_mul_hi_u32 s16, s9, s8
-; GFX9-NEXT: s_addc_u32 s5, s5, s17
+; GFX9-NEXT: s_addc_u32 s7, s7, s17
; GFX9-NEXT: s_addc_u32 s14, s16, 0
; GFX9-NEXT: s_mul_i32 s8, s9, s8
-; GFX9-NEXT: s_add_u32 s5, s5, s8
+; GFX9-NEXT: s_add_u32 s7, s7, s8
; GFX9-NEXT: s_addc_u32 s8, 0, s14
-; GFX9-NEXT: s_add_u32 s14, s4, s5
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s8, s9, s8
-; GFX9-NEXT: s_mul_i32 s4, s6, s8
-; GFX9-NEXT: s_mul_hi_u32 s5, s6, s14
-; GFX9-NEXT: s_add_i32 s4, s5, s4
-; GFX9-NEXT: s_mul_i32 s7, s7, s14
-; GFX9-NEXT: s_add_i32 s4, s4, s7
-; GFX9-NEXT: s_mul_i32 s6, s6, s14
-; GFX9-NEXT: s_mul_hi_u32 s7, s8, s6
-; GFX9-NEXT: s_mul_i32 s9, s8, s6
-; GFX9-NEXT: s_mul_i32 s16, s14, s4
-; GFX9-NEXT: s_mul_hi_u32 s6, s14, s6
-; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4
-; GFX9-NEXT: s_add_u32 s6, s6, s16
+; GFX9-NEXT: s_add_u32 s6, s6, s7
+; GFX9-NEXT: s_addc_u32 s7, s9, s8
+; GFX9-NEXT: s_mul_i32 s8, s4, s7
+; GFX9-NEXT: s_mul_hi_u32 s9, s4, s6
+; GFX9-NEXT: s_add_i32 s8, s9, s8
+; GFX9-NEXT: s_mul_i32 s5, s5, s6
+; GFX9-NEXT: s_add_i32 s8, s8, s5
+; GFX9-NEXT: s_mul_i32 s4, s4, s6
+; GFX9-NEXT: s_mul_hi_u32 s9, s7, s4
+; GFX9-NEXT: s_mul_i32 s14, s7, s4
+; GFX9-NEXT: s_mul_i32 s16, s6, s8
+; GFX9-NEXT: s_mul_hi_u32 s4, s6, s4
+; GFX9-NEXT: s_mul_hi_u32 s15, s6, s8
+; GFX9-NEXT: s_add_u32 s4, s4, s16
; GFX9-NEXT: s_addc_u32 s15, 0, s15
-; GFX9-NEXT: s_add_u32 s6, s6, s9
-; GFX9-NEXT: s_mul_hi_u32 s5, s8, s4
-; GFX9-NEXT: s_addc_u32 s6, s15, s7
+; GFX9-NEXT: s_add_u32 s4, s4, s14
+; GFX9-NEXT: s_mul_hi_u32 s5, s7, s8
+; GFX9-NEXT: s_addc_u32 s4, s15, s9
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s4, s8, s4
-; GFX9-NEXT: s_add_u32 s4, s6, s4
-; GFX9-NEXT: s_addc_u32 s6, 0, s5
-; GFX9-NEXT: s_add_u32 s9, s14, s4
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s8, s8, s6
+; GFX9-NEXT: s_mul_i32 s8, s7, s8
+; GFX9-NEXT: s_add_u32 s4, s4, s8
+; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_add_u32 s8, s6, s4
+; GFX9-NEXT: s_addc_u32 s9, s7, s5
; GFX9-NEXT: s_ashr_i32 s4, s11, 31
; GFX9-NEXT: s_add_u32 s6, s10, s4
; GFX9-NEXT: s_mov_b32 s5, s4
; GFX9-NEXT: s_addc_u32 s7, s11, s4
; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
-; GFX9-NEXT: s_mul_i32 s11, s6, s8
-; GFX9-NEXT: s_mul_hi_u32 s14, s6, s9
-; GFX9-NEXT: s_mul_hi_u32 s10, s6, s8
+; GFX9-NEXT: s_mul_i32 s11, s6, s9
+; GFX9-NEXT: s_mul_hi_u32 s14, s6, s8
+; GFX9-NEXT: s_mul_hi_u32 s10, s6, s9
; GFX9-NEXT: s_add_u32 s11, s14, s11
; GFX9-NEXT: s_addc_u32 s10, 0, s10
-; GFX9-NEXT: s_mul_hi_u32 s15, s7, s9
-; GFX9-NEXT: s_mul_i32 s9, s7, s9
-; GFX9-NEXT: s_add_u32 s9, s11, s9
-; GFX9-NEXT: s_mul_hi_u32 s14, s7, s8
-; GFX9-NEXT: s_addc_u32 s9, s10, s15
-; GFX9-NEXT: s_addc_u32 s10, s14, 0
+; GFX9-NEXT: s_mul_hi_u32 s15, s7, s8
; GFX9-NEXT: s_mul_i32 s8, s7, s8
-; GFX9-NEXT: s_add_u32 s8, s9, s8
+; GFX9-NEXT: s_add_u32 s8, s11, s8
+; GFX9-NEXT: s_mul_hi_u32 s14, s7, s9
+; GFX9-NEXT: s_addc_u32 s8, s10, s15
+; GFX9-NEXT: s_addc_u32 s10, s14, 0
+; GFX9-NEXT: s_mul_i32 s9, s7, s9
+; GFX9-NEXT: s_add_u32 s8, s8, s9
; GFX9-NEXT: s_addc_u32 s9, 0, s10
; GFX9-NEXT: s_mul_i32 s9, s2, s9
; GFX9-NEXT: s_mul_hi_u32 s10, s2, s8
@@ -10010,11 +9932,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_mul_i32 s8, s2, s8
; GFX9-NEXT: s_sub_u32 s6, s6, s8
; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_subb_u32 s15, s10, s3
; GFX9-NEXT: s_sub_u32 s16, s6, s2
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
; GFX9-NEXT: s_subb_u32 s17, s15, 0
; GFX9-NEXT: s_cmp_ge_u32 s17, s3
; GFX9-NEXT: s_cselect_b32 s18, -1, 0
@@ -10023,13 +9943,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s17, s3
; GFX9-NEXT: s_cselect_b32 s18, s19, s18
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s15, s15, s3
-; GFX9-NEXT: s_sub_u32 s19, s16, s2
-; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s10, s15, 0
+; GFX9-NEXT: s_subb_u32 s10, s15, s3
+; GFX9-NEXT: s_sub_u32 s11, s16, s2
+; GFX9-NEXT: s_subb_u32 s10, s10, 0
; GFX9-NEXT: s_cmp_lg_u32 s18, 0
-; GFX9-NEXT: s_cselect_b32 s11, s19, s16
+; GFX9-NEXT: s_cselect_b32 s11, s11, s16
; GFX9-NEXT: s_cselect_b32 s10, s10, s17
; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_subb_u32 s7, s7, s14