diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll | 1400 |
1 files changed, 659 insertions, 741 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index e27164c..51df8c3 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -6191,37 +6191,34 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GFX6-NEXT: s_ashr_i32 s8, s3, 31 -; GFX6-NEXT: s_add_i32 s3, s3, s8 -; GFX6-NEXT: s_xor_b32 s3, s3, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_sub_i32 s4, 0, s3 -; GFX6-NEXT: s_ashr_i32 s9, s2, 31 -; GFX6-NEXT: s_add_i32 s2, s2, s9 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_xor_b32 s2, s2, s9 +; GFX6-NEXT: s_abs_i32 s8, s3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX6-NEXT: s_sub_i32 s4, 0, s8 +; GFX6-NEXT: s_abs_i32 s9, s2 ; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s3 -; GFX6-NEXT: s_sub_i32 s0, s2, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s3 +; GFX6-NEXT: s_mul_i32 s0, s0, s8 +; GFX6-NEXT: s_sub_i32 s0, s9, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s8 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s0, s3 +; GFX6-NEXT: s_cmp_ge_u32 s0, s8 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: s_cselect_b32 s0, s1, s0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s0, s3 +; GFX6-NEXT: s_cmp_ge_u32 s0, s8 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX6-NEXT: s_xor_b32 s0, s2, s3 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: s_xor_b32 s0, s9, s8 +; GFX6-NEXT: s_ashr_i32 s0, s0, 31 ; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -6233,35 +6230,32 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GFX9-NEXT: s_ashr_i32 s4, s3, 31 -; GFX9-NEXT: s_add_i32 s3, s3, s4 -; GFX9-NEXT: s_xor_b32 s3, s3, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_sub_i32 s6, 0, s3 -; GFX9-NEXT: s_ashr_i32 s5, s2, 31 -; GFX9-NEXT: s_add_i32 s2, s2, s5 +; GFX9-NEXT: s_abs_i32 s4, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_sub_i32 s6, 0, s4 +; GFX9-NEXT: s_abs_i32 s5, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s5 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s7, v0 ; GFX9-NEXT: s_mul_i32 s6, s6, s7 ; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6 ; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_mul_hi_u32 s6, s2, s7 -; GFX9-NEXT: s_mul_i32 s8, s6, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s8 +; GFX9-NEXT: s_mul_hi_u32 s6, s5, s7 +; GFX9-NEXT: s_mul_i32 s8, s6, s4 +; GFX9-NEXT: s_sub_i32 s5, s5, s8 ; GFX9-NEXT: s_add_i32 s7, s6, 1 -; GFX9-NEXT: s_sub_i32 s8, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_sub_i32 s8, s5, s4 +; GFX9-NEXT: s_cmp_ge_u32 s5, s4 ; GFX9-NEXT: s_cselect_b32 s6, s7, s6 -; GFX9-NEXT: s_cselect_b32 s2, s8, s2 +; GFX9-NEXT: s_cselect_b32 s5, s8, s5 ; GFX9-NEXT: s_add_i32 s7, s6, 1 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s7, s6 -; GFX9-NEXT: s_xor_b32 s3, s5, s4 +; GFX9-NEXT: s_cmp_ge_u32 s5, s4 +; GFX9-NEXT: s_cselect_b32 s4, s7, s6 ; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s3 +; GFX9-NEXT: s_ashr_i32 s2, s2, 31 +; GFX9-NEXT: s_xor_b32 s3, s4, s2 +; GFX9-NEXT: s_sub_i32 s2, s3, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -6706,38 +6700,37 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; GFX6-LABEL: srem_i32_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GFX6-NEXT: s_ashr_i32 s4, s3, 31 -; GFX6-NEXT: s_add_i32 s3, s3, s4 -; GFX6-NEXT: s_xor_b32 s4, s3, s4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX6-NEXT: s_sub_i32 s3, 0, s4 -; GFX6-NEXT: s_ashr_i32 s5, s2, 31 -; GFX6-NEXT: s_add_i32 s2, s2, s5 +; GFX6-NEXT: s_abs_i32 s3, s3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX6-NEXT: s_sub_i32 s4, 0, s3 +; GFX6-NEXT: s_abs_i32 s8, s2 +; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_xor_b32 s6, s2, s5 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 +; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX6-NEXT: v_readfirstlane_b32 s7, v0 -; GFX6-NEXT: s_mul_i32 s7, s7, s4 -; GFX6-NEXT: s_sub_i32 s6, s6, s7 -; GFX6-NEXT: s_sub_i32 s7, s6, s4 -; GFX6-NEXT: s_cmp_ge_u32 s6, s4 -; GFX6-NEXT: s_cselect_b32 s6, s7, s6 -; GFX6-NEXT: s_sub_i32 s7, s6, s4 -; GFX6-NEXT: s_cmp_ge_u32 s6, s4 -; GFX6-NEXT: s_cselect_b32 s4, s7, s6 -; GFX6-NEXT: s_xor_b32 s4, s4, s5 -; GFX6-NEXT: s_sub_i32 s4, s4, s5 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s0, s0, s3 +; GFX6-NEXT: s_sub_i32 s0, s8, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s3 +; GFX6-NEXT: s_cmp_ge_u32 s0, s3 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s3 +; GFX6-NEXT: s_cmp_ge_u32 s0, s3 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_ashr_i32 s1, s2, 31 +; GFX6-NEXT: s_xor_b32 s0, s0, s1 +; GFX6-NEXT: s_sub_i32 s0, s0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_i32_pow2_shl_denom: @@ -6746,32 +6739,29 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GFX9-NEXT: s_ashr_i32 s4, s3, 31 -; GFX9-NEXT: s_add_i32 s3, s3, s4 -; GFX9-NEXT: s_xor_b32 s3, s3, s4 +; GFX9-NEXT: s_abs_i32 s3, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX9-NEXT: s_sub_i32 s5, 0, s3 -; GFX9-NEXT: s_ashr_i32 s4, s2, 31 -; GFX9-NEXT: s_add_i32 s2, s2, s4 +; GFX9-NEXT: s_abs_i32 s4, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 ; GFX9-NEXT: s_mul_i32 s5, s5, s6 ; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 ; GFX9-NEXT: s_add_i32 s6, s6, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6 +; GFX9-NEXT: s_mul_hi_u32 s5, s4, s6 ; GFX9-NEXT: s_mul_i32 s5, s5, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s5 -; GFX9-NEXT: s_sub_i32 s5, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_sub_i32 s5, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_xor_b32 s2, s2, s4 -; GFX9-NEXT: s_sub_i32 s2, s2, s4 +; GFX9-NEXT: s_sub_i32 s4, s4, s5 +; GFX9-NEXT: s_sub_i32 s5, s4, s3 +; GFX9-NEXT: s_cmp_ge_u32 s4, s3 +; GFX9-NEXT: s_cselect_b32 s4, s5, s4 +; GFX9-NEXT: s_sub_i32 s5, s4, s3 +; GFX9-NEXT: s_cmp_ge_u32 s4, s3 +; GFX9-NEXT: s_cselect_b32 s3, s5, s4 +; GFX9-NEXT: s_ashr_i32 s2, s2, 31 +; GFX9-NEXT: s_xor_b32 s3, s3, s2 +; GFX9-NEXT: s_sub_i32 s2, s3, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -7831,10 +7821,9 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_addc_u32 s15, 0, s16 ; GFX6-NEXT: s_add_u32 s16, s0, s1 ; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_addc_u32 s14, s14, s15 ; GFX6-NEXT: s_mul_i32 s0, s12, s14 ; GFX6-NEXT: v_readfirstlane_b32 s1, v0 @@ -7865,7 +7854,6 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_add_u32 s15, s16, s0 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_addc_u32 s14, s14, s12 ; GFX6-NEXT: s_ashr_i32 s12, s7, 31 ; GFX6-NEXT: s_add_u32 s0, s6, s12 @@ -7891,52 +7879,50 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: v_readfirstlane_b32 s4, v0 ; GFX6-NEXT: s_addc_u32 s4, s4, 0 ; GFX6-NEXT: s_mul_i32 s14, s7, s14 -; GFX6-NEXT: s_add_u32 s14, s1, s14 -; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: s_add_u32 s16, s1, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 ; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 -; GFX6-NEXT: s_addc_u32 s15, 0, s4 +; GFX6-NEXT: s_addc_u32 s17, 0, s4 ; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: s_mul_i32 s4, s10, s15 +; GFX6-NEXT: s_mul_i32 s4, s10, s17 ; GFX6-NEXT: v_readfirstlane_b32 s5, v0 ; GFX6-NEXT: s_add_i32 s4, s5, s4 -; GFX6-NEXT: s_mul_i32 s5, s11, s14 -; GFX6-NEXT: s_add_i32 s16, s4, s5 -; GFX6-NEXT: s_sub_i32 s17, s7, s16 -; GFX6-NEXT: s_mul_i32 s4, s10, s14 +; GFX6-NEXT: s_mul_i32 s5, s11, s16 +; GFX6-NEXT: s_add_i32 s18, s4, s5 +; GFX6-NEXT: s_sub_i32 s14, s7, s18 +; GFX6-NEXT: s_mul_i32 s4, s10, s16 ; GFX6-NEXT: s_sub_u32 s6, s6, s4 ; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX6-NEXT: s_or_b32 s18, s4, s5 -; GFX6-NEXT: s_cmp_lg_u32 s18, 0 -; GFX6-NEXT: s_subb_u32 s17, s17, s11 -; GFX6-NEXT: s_sub_u32 s19, s6, s10 -; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX6-NEXT: s_or_b32 s15, s4, s5 +; GFX6-NEXT: s_subb_u32 s19, s14, s11 +; GFX6-NEXT: s_sub_u32 s20, s6, s10 +; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX6-NEXT: s_or_b32 s14, s14, s15 +; GFX6-NEXT: s_subb_u32 s14, s19, 0 +; GFX6-NEXT: s_cmp_ge_u32 s14, s11 +; GFX6-NEXT: s_cselect_b32 s15, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s20, s10 +; GFX6-NEXT: s_cselect_b32 s19, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s14, s11 +; GFX6-NEXT: s_cselect_b32 s14, s19, s15 +; GFX6-NEXT: s_add_u32 s15, s16, 1 +; GFX6-NEXT: s_addc_u32 s19, s17, 0 +; GFX6-NEXT: s_add_u32 s20, s16, 2 +; GFX6-NEXT: s_addc_u32 s21, s17, 0 +; GFX6-NEXT: s_cmp_lg_u32 s14, 0 +; GFX6-NEXT: s_cselect_b32 s14, s20, s15 +; GFX6-NEXT: s_cselect_b32 s15, s21, s19 ; GFX6-NEXT: s_or_b32 s4, s4, s5 -; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_subb_u32 s4, s17, 0 +; GFX6-NEXT: s_subb_u32 s4, s7, s18 ; GFX6-NEXT: s_cmp_ge_u32 s4, s11 ; GFX6-NEXT: s_cselect_b32 s5, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s19, s10 -; GFX6-NEXT: s_cselect_b32 s17, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s4, s11 -; GFX6-NEXT: s_cselect_b32 s4, s17, s5 -; GFX6-NEXT: s_add_u32 s5, s14, 1 -; GFX6-NEXT: s_addc_u32 s17, s15, 0 -; GFX6-NEXT: s_add_u32 s19, s14, 2 -; GFX6-NEXT: s_addc_u32 s20, s15, 0 -; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_cselect_b32 s4, s19, s5 -; GFX6-NEXT: s_cselect_b32 s5, s20, s17 -; GFX6-NEXT: s_cmp_lg_u32 s18, 0 -; GFX6-NEXT: s_subb_u32 s7, s7, s16 -; GFX6-NEXT: s_cmp_ge_u32 s7, s11 -; GFX6-NEXT: s_cselect_b32 s16, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s6, s10 ; GFX6-NEXT: s_cselect_b32 s6, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s7, s11 -; GFX6-NEXT: s_cselect_b32 s6, s6, s16 -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 -; GFX6-NEXT: s_cselect_b32 s5, s5, s15 -; GFX6-NEXT: s_cselect_b32 s4, s4, s14 +; GFX6-NEXT: s_cmp_eq_u32 s4, s11 +; GFX6-NEXT: s_cselect_b32 s4, s6, s5 +; GFX6-NEXT: s_cmp_lg_u32 s4, 0 +; GFX6-NEXT: s_cselect_b32 s5, s15, s17 +; GFX6-NEXT: s_cselect_b32 s4, s14, s16 ; GFX6-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9] ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: s_sub_u32 s4, s4, s6 @@ -7959,8 +7945,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_sub_u32 s10, 0, s8 -; GFX9-NEXT: s_subb_u32 s11, 0, s9 +; GFX9-NEXT: s_sub_u32 s4, 0, s8 +; GFX9-NEXT: s_subb_u32 s5, 0, s9 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -7970,56 +7956,52 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s12, v2 -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_mul_i32 s5, s10, s12 -; GFX9-NEXT: s_mul_hi_u32 s14, s10, s4 -; GFX9-NEXT: s_mul_i32 s13, s11, s4 -; GFX9-NEXT: s_add_i32 s5, s14, s5 -; GFX9-NEXT: s_mul_i32 s15, s10, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s13 -; GFX9-NEXT: s_mul_hi_u32 s14, s4, s15 -; GFX9-NEXT: s_mul_i32 s16, s4, s5 -; GFX9-NEXT: s_mul_hi_u32 s13, s4, s5 +; GFX9-NEXT: v_readfirstlane_b32 s10, v2 +; GFX9-NEXT: v_readfirstlane_b32 s11, v1 +; GFX9-NEXT: s_mul_i32 s12, s4, s10 +; GFX9-NEXT: s_mul_hi_u32 s14, s4, s11 +; GFX9-NEXT: s_mul_i32 s13, s5, s11 +; GFX9-NEXT: s_add_i32 s12, s14, s12 +; GFX9-NEXT: s_mul_i32 s15, s4, s11 +; GFX9-NEXT: s_add_i32 s12, s12, s13 +; GFX9-NEXT: s_mul_hi_u32 s14, s11, s15 +; GFX9-NEXT: s_mul_i32 s16, s11, s12 +; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12 ; GFX9-NEXT: s_add_u32 s14, s14, s16 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15 -; GFX9-NEXT: s_mul_i32 s15, s12, s15 +; GFX9-NEXT: s_mul_hi_u32 s17, s10, s15 +; GFX9-NEXT: s_mul_i32 s15, s10, s15 ; GFX9-NEXT: s_add_u32 s14, s14, s15 -; GFX9-NEXT: s_mul_hi_u32 s16, s12, s5 +; GFX9-NEXT: s_mul_hi_u32 s16, s10, s12 ; GFX9-NEXT: s_addc_u32 s13, s13, s17 ; GFX9-NEXT: s_addc_u32 s14, s16, 0 -; GFX9-NEXT: s_mul_i32 s5, s12, s5 -; GFX9-NEXT: s_add_u32 s5, s13, s5 +; GFX9-NEXT: s_mul_i32 s12, s10, s12 +; GFX9-NEXT: s_add_u32 s12, s13, s12 ; GFX9-NEXT: s_addc_u32 s13, 0, s14 -; GFX9-NEXT: s_add_u32 s14, s4, s5 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s12, s12, s13 -; GFX9-NEXT: s_mul_i32 s4, s10, s12 -; GFX9-NEXT: s_mul_hi_u32 s5, s10, s14 -; GFX9-NEXT: s_add_i32 s4, s5, s4 -; GFX9-NEXT: s_mul_i32 s11, s11, s14 -; GFX9-NEXT: s_add_i32 s4, s4, s11 -; GFX9-NEXT: s_mul_i32 s10, s10, s14 -; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10 -; GFX9-NEXT: s_mul_i32 s13, s12, s10 -; GFX9-NEXT: s_mul_i32 s16, s14, s4 -; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10 -; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4 -; GFX9-NEXT: s_add_u32 s10, s10, s16 +; GFX9-NEXT: s_add_u32 s11, s11, s12 +; GFX9-NEXT: s_addc_u32 s10, s10, s13 +; GFX9-NEXT: s_mul_i32 s12, s4, s10 +; GFX9-NEXT: s_mul_hi_u32 s13, s4, s11 +; GFX9-NEXT: s_add_i32 s12, s13, s12 +; GFX9-NEXT: s_mul_i32 s5, s5, s11 +; GFX9-NEXT: s_add_i32 s12, s12, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s11 +; GFX9-NEXT: s_mul_hi_u32 s13, s10, s4 +; GFX9-NEXT: s_mul_i32 s14, s10, s4 +; GFX9-NEXT: s_mul_i32 s16, s11, s12 +; GFX9-NEXT: s_mul_hi_u32 s4, s11, s4 +; GFX9-NEXT: s_mul_hi_u32 s15, s11, s12 +; GFX9-NEXT: s_add_u32 s4, s4, s16 ; GFX9-NEXT: s_addc_u32 s15, 0, s15 -; GFX9-NEXT: s_add_u32 s10, s10, s13 -; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4 -; GFX9-NEXT: s_addc_u32 s10, s15, s11 +; GFX9-NEXT: s_add_u32 s4, s4, s14 +; GFX9-NEXT: s_mul_hi_u32 s5, s10, s12 +; GFX9-NEXT: s_addc_u32 s4, s15, s13 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_mul_i32 s4, s12, s4 -; GFX9-NEXT: s_add_u32 s4, s10, s4 -; GFX9-NEXT: s_addc_u32 s10, 0, s5 -; GFX9-NEXT: s_add_u32 s11, s14, s4 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s10, s12, s10 +; GFX9-NEXT: s_mul_i32 s12, s10, s12 +; GFX9-NEXT: s_add_u32 s4, s4, s12 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_add_u32 s11, s11, s4 +; GFX9-NEXT: s_addc_u32 s10, s10, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: s_add_u32 s2, s2, s4 @@ -8038,38 +8020,35 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_addc_u32 s11, s12, s15 ; GFX9-NEXT: s_addc_u32 s12, s14, 0 ; GFX9-NEXT: s_mul_i32 s10, s3, s10 -; GFX9-NEXT: s_add_u32 s14, s11, s10 -; GFX9-NEXT: s_addc_u32 s15, 0, s12 -; GFX9-NEXT: s_mul_i32 s10, s8, s15 -; GFX9-NEXT: s_mul_hi_u32 s11, s8, s14 +; GFX9-NEXT: s_add_u32 s13, s11, s10 +; GFX9-NEXT: s_addc_u32 s12, 0, s12 +; GFX9-NEXT: s_mul_i32 s10, s8, s12 +; GFX9-NEXT: s_mul_hi_u32 s11, s8, s13 ; GFX9-NEXT: s_add_i32 s10, s11, s10 -; GFX9-NEXT: s_mul_i32 s11, s9, s14 -; GFX9-NEXT: s_add_i32 s16, s10, s11 -; GFX9-NEXT: s_sub_i32 s12, s3, s16 -; GFX9-NEXT: s_mul_i32 s10, s8, s14 +; GFX9-NEXT: s_mul_i32 s11, s9, s13 +; GFX9-NEXT: s_add_i32 s14, s10, s11 +; GFX9-NEXT: s_sub_i32 s15, s3, s14 +; GFX9-NEXT: s_mul_i32 s10, s8, s13 ; GFX9-NEXT: s_sub_u32 s2, s2, s10 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s17, s12, s9 -; GFX9-NEXT: s_sub_u32 s18, s2, s8 -; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GFX9-NEXT: s_subb_u32 s12, s17, 0 -; GFX9-NEXT: s_cmp_ge_u32 s12, s9 -; GFX9-NEXT: s_cselect_b32 s13, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s18, s8 +; GFX9-NEXT: s_subb_u32 s15, s15, s9 +; GFX9-NEXT: s_sub_u32 s16, s2, s8 +; GFX9-NEXT: s_subb_u32 s15, s15, 0 +; GFX9-NEXT: s_cmp_ge_u32 s15, s9 ; GFX9-NEXT: s_cselect_b32 s17, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s12, s9 -; GFX9-NEXT: s_cselect_b32 s12, s17, s13 -; GFX9-NEXT: s_add_u32 s13, s14, 1 -; GFX9-NEXT: s_addc_u32 s17, s15, 0 -; GFX9-NEXT: s_add_u32 s18, s14, 2 -; GFX9-NEXT: s_addc_u32 s19, s15, 0 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 -; GFX9-NEXT: s_cselect_b32 s12, s18, s13 -; GFX9-NEXT: s_cselect_b32 s13, s19, s17 +; GFX9-NEXT: s_cmp_ge_u32 s16, s8 +; GFX9-NEXT: s_cselect_b32 s16, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s15, s9 +; GFX9-NEXT: s_cselect_b32 s15, s16, s17 +; GFX9-NEXT: s_add_u32 s16, s13, 1 +; GFX9-NEXT: s_addc_u32 s17, s12, 0 +; GFX9-NEXT: s_add_u32 s18, s13, 2 +; GFX9-NEXT: s_addc_u32 s19, s12, 0 +; GFX9-NEXT: s_cmp_lg_u32 s15, 0 +; GFX9-NEXT: s_cselect_b32 s15, s18, s16 +; GFX9-NEXT: s_cselect_b32 s16, s19, s17 ; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s3, s3, s16 +; GFX9-NEXT: s_subb_u32 s3, s3, s14 ; GFX9-NEXT: s_cmp_ge_u32 s3, s9 ; GFX9-NEXT: s_cselect_b32 s10, -1, 0 ; GFX9-NEXT: s_cmp_ge_u32 s2, s8 @@ -8077,8 +8056,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_cmp_eq_u32 s3, s9 ; GFX9-NEXT: s_cselect_b32 s2, s2, s10 ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 -; GFX9-NEXT: s_cselect_b32 s3, s13, s15 -; GFX9-NEXT: s_cselect_b32 s2, s12, s14 +; GFX9-NEXT: s_cselect_b32 s3, s16, s12 +; GFX9-NEXT: s_cselect_b32 s2, s15, s13 ; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: s_sub_u32 s2, s2, s4 @@ -8338,10 +8317,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_addc_u32 s17, 0, s18 ; GFX6-NEXT: s_add_u32 s18, s12, s13 ; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_addc_u32 s16, s16, s17 ; GFX6-NEXT: s_mul_i32 s12, s14, s16 ; GFX6-NEXT: v_readfirstlane_b32 s13, v0 @@ -8372,7 +8350,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_add_u32 s15, s18, s12 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_addc_u32 s14, s16, s14 ; GFX6-NEXT: s_ashr_i32 s12, s9, 31 ; GFX6-NEXT: s_add_u32 s8, s8, s12 @@ -8397,55 +8374,53 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 ; GFX6-NEXT: s_addc_u32 s16, s16, 0 ; GFX6-NEXT: s_mul_i32 s14, s9, s14 -; GFX6-NEXT: s_add_u32 s17, s15, s14 -; GFX6-NEXT: v_mov_b32_e32 v0, s17 +; GFX6-NEXT: s_add_u32 s18, s15, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s18 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX6-NEXT: s_addc_u32 s16, 0, s16 -; GFX6-NEXT: s_mul_i32 s14, s6, s16 +; GFX6-NEXT: s_addc_u32 s19, 0, s16 +; GFX6-NEXT: s_mul_i32 s14, s6, s19 ; GFX6-NEXT: v_readfirstlane_b32 s15, v0 ; GFX6-NEXT: s_add_i32 s14, s15, s14 -; GFX6-NEXT: s_mul_i32 s15, s7, s17 -; GFX6-NEXT: s_add_i32 s18, s14, s15 -; GFX6-NEXT: s_sub_i32 s19, s9, s18 -; GFX6-NEXT: s_mul_i32 s14, s6, s17 +; GFX6-NEXT: s_mul_i32 s15, s7, s18 +; GFX6-NEXT: s_add_i32 s20, s14, s15 +; GFX6-NEXT: s_sub_i32 s16, s9, s20 +; GFX6-NEXT: s_mul_i32 s14, s6, s18 ; GFX6-NEXT: s_sub_u32 s8, s8, s14 ; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s20, s14, s15 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_subb_u32 s19, s19, s7 -; GFX6-NEXT: s_sub_u32 s21, s8, s6 -; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX6-NEXT: s_or_b32 s17, s14, s15 +; GFX6-NEXT: s_subb_u32 s21, s16, s7 +; GFX6-NEXT: s_sub_u32 s22, s8, s6 +; GFX6-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GFX6-NEXT: s_or_b32 s16, s16, s17 +; GFX6-NEXT: s_subb_u32 s16, s21, 0 +; GFX6-NEXT: s_cmp_ge_u32 s16, s7 +; GFX6-NEXT: s_cselect_b32 s17, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s22, s6 +; GFX6-NEXT: s_cselect_b32 s21, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s16, s7 +; GFX6-NEXT: s_cselect_b32 s16, s21, s17 +; GFX6-NEXT: s_add_u32 s17, s18, 1 +; GFX6-NEXT: s_addc_u32 s21, s19, 0 +; GFX6-NEXT: s_add_u32 s22, s18, 2 +; GFX6-NEXT: s_addc_u32 s23, s19, 0 +; GFX6-NEXT: s_cmp_lg_u32 s16, 0 +; GFX6-NEXT: s_cselect_b32 s16, s22, s17 +; GFX6-NEXT: s_cselect_b32 s17, s23, s21 ; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_cmp_lg_u32 s14, 0 -; GFX6-NEXT: s_subb_u32 s14, s19, 0 -; GFX6-NEXT: s_cmp_ge_u32 s14, s7 -; GFX6-NEXT: s_cselect_b32 s15, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s21, s6 -; GFX6-NEXT: s_cselect_b32 s19, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s14, s7 -; GFX6-NEXT: s_cselect_b32 s14, s19, s15 -; GFX6-NEXT: s_add_u32 s15, s17, 1 -; GFX6-NEXT: s_addc_u32 s19, s16, 0 -; GFX6-NEXT: s_add_u32 s21, s17, 2 -; GFX6-NEXT: s_addc_u32 s22, s16, 0 -; GFX6-NEXT: s_cmp_lg_u32 s14, 0 -; GFX6-NEXT: s_cselect_b32 s14, s21, s15 -; GFX6-NEXT: s_cselect_b32 s15, s22, s19 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_subb_u32 s9, s9, s18 +; GFX6-NEXT: s_subb_u32 s9, s9, s20 ; GFX6-NEXT: s_cmp_ge_u32 s9, s7 -; GFX6-NEXT: s_cselect_b32 s18, -1, 0 +; GFX6-NEXT: s_cselect_b32 s14, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s8, s6 ; GFX6-NEXT: s_cselect_b32 s6, -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s9, s7 -; GFX6-NEXT: s_cselect_b32 s6, s6, s18 +; GFX6-NEXT: s_cselect_b32 s6, s6, s14 ; GFX6-NEXT: s_cmp_lg_u32 s6, 0 -; GFX6-NEXT: s_cselect_b32 s7, s15, s16 -; GFX6-NEXT: s_cselect_b32 s6, s14, s17 +; GFX6-NEXT: s_cselect_b32 s7, s17, s19 +; GFX6-NEXT: s_cselect_b32 s6, s16, s18 ; GFX6-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3] ; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] -; GFX6-NEXT: s_sub_u32 s14, s6, s2 -; GFX6-NEXT: s_subb_u32 s15, s7, s3 +; GFX6-NEXT: s_sub_u32 s16, s6, s2 +; GFX6-NEXT: s_subb_u32 s17, s7, s3 ; GFX6-NEXT: s_ashr_i32 s6, s1, 31 ; GFX6-NEXT: s_add_u32 s0, s0, s6 ; GFX6-NEXT: s_mov_b32 s7, s6 @@ -8464,40 +8439,39 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 -; GFX6-NEXT: v_readfirstlane_b32 s16, v1 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s1, s12, s16 +; GFX6-NEXT: s_mul_i32 s1, s12, s14 ; GFX6-NEXT: v_readfirstlane_b32 s3, v2 ; GFX6-NEXT: s_mul_i32 s0, s13, s2 ; GFX6-NEXT: s_add_i32 s1, s3, s1 ; GFX6-NEXT: s_add_i32 s3, s1, s0 -; GFX6-NEXT: s_mul_i32 s17, s12, s2 +; GFX6-NEXT: s_mul_i32 s15, s12, s2 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s17 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mul_i32 s4, s2, s3 ; GFX6-NEXT: v_readfirstlane_b32 s5, v2 ; GFX6-NEXT: v_readfirstlane_b32 s18, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, v1, s17 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15 ; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3 ; GFX6-NEXT: s_add_u32 s4, s18, s4 ; GFX6-NEXT: s_addc_u32 s5, 0, s5 -; GFX6-NEXT: s_mul_i32 s17, s16, s17 +; GFX6-NEXT: s_mul_i32 s15, s14, s15 ; GFX6-NEXT: v_readfirstlane_b32 s18, v0 -; GFX6-NEXT: s_add_u32 s4, s4, s17 +; GFX6-NEXT: s_add_u32 s4, s4, s15 ; GFX6-NEXT: s_addc_u32 s4, s5, s18 ; GFX6-NEXT: v_readfirstlane_b32 s5, v1 ; GFX6-NEXT: s_addc_u32 s5, s5, 0 -; GFX6-NEXT: s_mul_i32 s3, s16, s3 +; GFX6-NEXT: s_mul_i32 s3, s14, s3 ; GFX6-NEXT: s_add_u32 s3, s4, s3 ; GFX6-NEXT: s_addc_u32 s4, 0, s5 ; GFX6-NEXT: s_add_u32 s5, s2, s3 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_addc_u32 s4, s16, s4 +; GFX6-NEXT: s_addc_u32 s4, s14, s4 ; GFX6-NEXT: s_mul_i32 s2, s12, s4 ; GFX6-NEXT: v_readfirstlane_b32 s3, v0 ; GFX6-NEXT: s_add_i32 s2, s3, s2 @@ -8511,14 +8485,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: s_mul_i32 s13, s5, s2 -; GFX6-NEXT: v_readfirstlane_b32 s17, v2 -; GFX6-NEXT: s_add_u32 s13, s17, s13 -; GFX6-NEXT: v_readfirstlane_b32 s16, v0 +; GFX6-NEXT: v_readfirstlane_b32 s15, v2 +; GFX6-NEXT: s_add_u32 s13, s15, s13 +; GFX6-NEXT: v_readfirstlane_b32 s14, v0 ; GFX6-NEXT: s_mul_i32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: s_addc_u32 s14, 0, s14 ; GFX6-NEXT: v_readfirstlane_b32 s12, v3 ; GFX6-NEXT: s_add_u32 s3, s13, s3 -; GFX6-NEXT: s_addc_u32 s3, s16, s12 +; GFX6-NEXT: s_addc_u32 s3, s14, s12 ; GFX6-NEXT: v_readfirstlane_b32 s12, v1 ; GFX6-NEXT: s_addc_u32 s12, s12, 0 ; GFX6-NEXT: s_mul_i32 s2, s4, s2 @@ -8527,7 +8501,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_add_u32 s13, s5, s2 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 ; GFX6-NEXT: s_addc_u32 s12, s4, s12 ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 ; GFX6-NEXT: s_add_u32 s2, s10, s4 @@ -8539,72 +8512,70 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mov_b32_e32 v2, s13 ; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2 ; GFX6-NEXT: s_mul_i32 s2, s10, s12 -; GFX6-NEXT: v_readfirstlane_b32 s16, v1 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s11, v2 -; GFX6-NEXT: v_readfirstlane_b32 s17, v3 +; GFX6-NEXT: v_readfirstlane_b32 s15, v3 ; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX6-NEXT: s_add_u32 s2, s17, s2 -; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: s_add_u32 s2, s15, s2 +; GFX6-NEXT: s_addc_u32 s14, 0, s14 ; GFX6-NEXT: s_mul_i32 s13, s11, s13 -; GFX6-NEXT: v_readfirstlane_b32 s17, v1 +; GFX6-NEXT: v_readfirstlane_b32 s15, v1 ; GFX6-NEXT: s_add_u32 s2, s2, s13 -; GFX6-NEXT: s_addc_u32 s2, s16, s17 +; GFX6-NEXT: s_addc_u32 s2, s14, s15 ; GFX6-NEXT: v_readfirstlane_b32 s13, v0 ; GFX6-NEXT: s_addc_u32 s13, s13, 0 ; GFX6-NEXT: s_mul_i32 s12, s11, s12 -; GFX6-NEXT: s_add_u32 s16, s2, s12 -; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: s_add_u32 s18, s2, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s18 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: s_addc_u32 s17, 0, s13 -; GFX6-NEXT: s_mul_i32 s12, s8, s17 +; GFX6-NEXT: s_addc_u32 s19, 0, s13 +; GFX6-NEXT: s_mul_i32 s12, s8, s19 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_readfirstlane_b32 s13, v0 ; GFX6-NEXT: s_add_i32 s12, s13, s12 -; GFX6-NEXT: s_mul_i32 s13, s9, s16 -; GFX6-NEXT: s_add_i32 s18, s12, s13 -; GFX6-NEXT: s_sub_i32 s19, s11, s18 -; GFX6-NEXT: s_mul_i32 s12, s8, s16 +; GFX6-NEXT: s_mul_i32 s13, s9, s18 +; GFX6-NEXT: s_add_i32 s20, s12, s13 +; GFX6-NEXT: s_sub_i32 s14, s11, s20 +; GFX6-NEXT: s_mul_i32 s12, s8, s18 ; GFX6-NEXT: s_sub_u32 s10, s10, s12 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s20, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_subb_u32 s19, s19, s9 -; GFX6-NEXT: s_sub_u32 s21, s10, s8 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s15, s12, s13 +; GFX6-NEXT: s_subb_u32 s21, s14, s9 +; GFX6-NEXT: s_sub_u32 s22, s10, s8 +; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX6-NEXT: s_or_b32 s14, s14, s15 +; GFX6-NEXT: s_subb_u32 s14, s21, 0 +; GFX6-NEXT: s_cmp_ge_u32 s14, s9 +; GFX6-NEXT: s_cselect_b32 s15, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s22, s8 +; GFX6-NEXT: s_cselect_b32 s21, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s14, s9 +; GFX6-NEXT: s_cselect_b32 s14, s21, s15 +; GFX6-NEXT: s_add_u32 s15, s18, 1 +; GFX6-NEXT: s_addc_u32 s21, s19, 0 +; GFX6-NEXT: s_add_u32 s22, s18, 2 +; GFX6-NEXT: s_addc_u32 s23, s19, 0 +; GFX6-NEXT: s_cmp_lg_u32 s14, 0 +; GFX6-NEXT: s_cselect_b32 s14, s22, s15 +; GFX6-NEXT: s_cselect_b32 s15, s23, s21 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_subb_u32 s12, s19, 0 -; GFX6-NEXT: s_cmp_ge_u32 s12, s9 -; GFX6-NEXT: s_cselect_b32 s13, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s21, s8 -; GFX6-NEXT: s_cselect_b32 s19, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s12, s9 -; GFX6-NEXT: s_cselect_b32 s12, s19, s13 -; GFX6-NEXT: s_add_u32 s13, s16, 1 -; GFX6-NEXT: s_addc_u32 s19, s17, 0 -; GFX6-NEXT: s_add_u32 s21, s16, 2 -; GFX6-NEXT: s_addc_u32 s22, s17, 0 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_cselect_b32 s12, s21, s13 -; GFX6-NEXT: s_cselect_b32 s13, s22, s19 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_subb_u32 s11, s11, s18 +; GFX6-NEXT: s_subb_u32 s11, s11, s20 ; GFX6-NEXT: s_cmp_ge_u32 s11, s9 -; GFX6-NEXT: s_cselect_b32 s18, -1, 0 +; GFX6-NEXT: s_cselect_b32 s12, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s10, s8 ; GFX6-NEXT: s_cselect_b32 s8, -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s11, s9 -; GFX6-NEXT: s_cselect_b32 s8, s8, s18 +; GFX6-NEXT: s_cselect_b32 s8, s8, s12 ; GFX6-NEXT: s_cmp_lg_u32 s8, 0 -; GFX6-NEXT: s_cselect_b32 s9, s13, s17 -; GFX6-NEXT: s_cselect_b32 s8, s12, s16 +; GFX6-NEXT: s_cselect_b32 s9, s15, s19 +; GFX6-NEXT: s_cselect_b32 s8, s14, s18 ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5] ; GFX6-NEXT: s_sub_u32 s4, s6, s4 ; GFX6-NEXT: s_subb_u32 s5, s7, s5 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v0, s14 -; GFX6-NEXT: v_mov_b32_e32 v1, s15 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: v_mov_b32_e32 v1, s17 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8624,8 +8595,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_u32 s14, 0, s6 -; GFX9-NEXT: s_subb_u32 s15, 0, s7 +; GFX9-NEXT: s_sub_u32 s12, 0, s6 +; GFX9-NEXT: s_subb_u32 s13, 0, s7 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -8634,56 +8605,52 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s16, v1 -; GFX9-NEXT: v_readfirstlane_b32 s12, v0 -; GFX9-NEXT: s_mul_i32 s13, s14, s16 -; GFX9-NEXT: s_mul_hi_u32 s18, s14, s12 -; GFX9-NEXT: s_mul_i32 s17, s15, s12 -; GFX9-NEXT: s_add_i32 s13, s18, s13 -; GFX9-NEXT: s_mul_i32 s19, s14, s12 -; GFX9-NEXT: s_add_i32 s13, s13, s17 -; GFX9-NEXT: s_mul_hi_u32 s18, s12, s19 -; GFX9-NEXT: s_mul_i32 s20, s12, s13 -; GFX9-NEXT: s_mul_hi_u32 s17, s12, s13 +; GFX9-NEXT: v_readfirstlane_b32 s14, v1 +; GFX9-NEXT: v_readfirstlane_b32 s15, v0 +; GFX9-NEXT: s_mul_i32 s16, s12, s14 +; GFX9-NEXT: s_mul_hi_u32 s18, s12, s15 +; GFX9-NEXT: s_mul_i32 s17, s13, s15 +; GFX9-NEXT: s_add_i32 s16, s18, s16 +; GFX9-NEXT: s_mul_i32 s19, s12, s15 +; GFX9-NEXT: s_add_i32 s16, s16, s17 +; GFX9-NEXT: s_mul_hi_u32 s18, s15, s19 +; GFX9-NEXT: s_mul_i32 s20, s15, s16 +; GFX9-NEXT: s_mul_hi_u32 s17, s15, s16 ; GFX9-NEXT: s_add_u32 s18, s18, s20 ; GFX9-NEXT: s_addc_u32 s17, 0, s17 -; GFX9-NEXT: s_mul_hi_u32 s20, s16, s19 -; GFX9-NEXT: s_mul_i32 s19, s16, s19 +; GFX9-NEXT: s_mul_hi_u32 s20, s14, s19 +; GFX9-NEXT: s_mul_i32 s19, s14, s19 ; GFX9-NEXT: s_add_u32 s18, s18, s19 -; GFX9-NEXT: s_mul_hi_u32 s21, s16, s13 +; GFX9-NEXT: s_mul_hi_u32 s21, s14, s16 ; GFX9-NEXT: s_addc_u32 s17, s17, s20 ; GFX9-NEXT: s_addc_u32 s18, s21, 0 -; GFX9-NEXT: s_mul_i32 s13, s16, s13 -; GFX9-NEXT: s_add_u32 s13, s17, s13 +; GFX9-NEXT: s_mul_i32 s16, s14, s16 +; GFX9-NEXT: s_add_u32 s16, s17, s16 ; GFX9-NEXT: s_addc_u32 s17, 0, s18 -; GFX9-NEXT: s_add_u32 s18, s12, s13 -; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GFX9-NEXT: s_addc_u32 s16, s16, s17 -; GFX9-NEXT: s_mul_i32 s12, s14, s16 -; GFX9-NEXT: s_mul_hi_u32 s13, s14, s18 -; GFX9-NEXT: s_add_i32 s12, s13, s12 -; GFX9-NEXT: s_mul_i32 s15, s15, s18 -; GFX9-NEXT: s_add_i32 s12, s12, s15 -; GFX9-NEXT: s_mul_i32 s14, s14, s18 -; GFX9-NEXT: s_mul_hi_u32 s15, s16, s14 -; GFX9-NEXT: s_mul_i32 s17, s16, s14 -; GFX9-NEXT: s_mul_i32 s20, s18, s12 -; GFX9-NEXT: s_mul_hi_u32 s14, s18, s14 -; GFX9-NEXT: s_mul_hi_u32 s19, s18, s12 -; GFX9-NEXT: s_add_u32 s14, s14, s20 +; GFX9-NEXT: s_add_u32 s15, s15, s16 +; GFX9-NEXT: s_addc_u32 s14, s14, s17 +; GFX9-NEXT: s_mul_i32 s16, s12, s14 +; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15 +; GFX9-NEXT: s_add_i32 s16, s17, s16 +; GFX9-NEXT: s_mul_i32 s13, s13, s15 +; GFX9-NEXT: s_add_i32 s16, s16, s13 +; GFX9-NEXT: s_mul_i32 s12, s12, s15 +; GFX9-NEXT: s_mul_hi_u32 s17, s14, s12 +; GFX9-NEXT: s_mul_i32 s18, s14, s12 +; GFX9-NEXT: s_mul_i32 s20, s15, s16 +; GFX9-NEXT: s_mul_hi_u32 s12, s15, s12 +; GFX9-NEXT: s_mul_hi_u32 s19, s15, s16 +; GFX9-NEXT: s_add_u32 s12, s12, s20 ; GFX9-NEXT: s_addc_u32 s19, 0, s19 -; GFX9-NEXT: s_add_u32 s14, s14, s17 -; GFX9-NEXT: s_mul_hi_u32 s13, s16, s12 -; GFX9-NEXT: s_addc_u32 s14, s19, s15 +; GFX9-NEXT: s_add_u32 s12, s12, s18 +; GFX9-NEXT: s_mul_hi_u32 s13, s14, s16 +; GFX9-NEXT: s_addc_u32 s12, s19, s17 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NEXT: s_mul_i32 s12, s16, s12 -; GFX9-NEXT: s_add_u32 s12, s14, s12 -; GFX9-NEXT: s_addc_u32 s14, 0, s13 -; GFX9-NEXT: s_add_u32 s15, s18, s12 -; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GFX9-NEXT: s_addc_u32 s14, s16, s14 +; GFX9-NEXT: s_mul_i32 s16, s14, s16 +; GFX9-NEXT: s_add_u32 s12, s12, s16 +; GFX9-NEXT: s_addc_u32 s13, 0, s13 +; GFX9-NEXT: s_add_u32 s15, s15, s12 +; GFX9-NEXT: s_addc_u32 s14, s14, s13 ; GFX9-NEXT: s_ashr_i32 s12, s9, 31 ; GFX9-NEXT: s_add_u32 s8, s8, s12 ; GFX9-NEXT: s_mov_b32 s13, s12 @@ -8701,38 +8668,35 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_addc_u32 s15, s16, s19 ; GFX9-NEXT: s_addc_u32 s16, s18, 0 ; GFX9-NEXT: s_mul_i32 s14, s9, s14 -; GFX9-NEXT: s_add_u32 s18, s15, s14 -; GFX9-NEXT: s_addc_u32 s19, 0, s16 -; GFX9-NEXT: s_mul_i32 s14, s6, s19 -; GFX9-NEXT: s_mul_hi_u32 s15, s6, s18 +; GFX9-NEXT: s_add_u32 s17, s15, s14 +; GFX9-NEXT: s_addc_u32 s16, 0, s16 +; GFX9-NEXT: s_mul_i32 s14, s6, s16 +; GFX9-NEXT: s_mul_hi_u32 s15, s6, s17 ; GFX9-NEXT: s_add_i32 s14, s15, s14 -; GFX9-NEXT: s_mul_i32 s15, s7, s18 -; GFX9-NEXT: s_add_i32 s20, s14, s15 -; GFX9-NEXT: s_sub_i32 s16, s9, s20 -; GFX9-NEXT: s_mul_i32 s14, s6, s18 +; GFX9-NEXT: s_mul_i32 s15, s7, s17 +; GFX9-NEXT: s_add_i32 s18, s14, s15 +; GFX9-NEXT: s_sub_i32 s19, s9, s18 +; GFX9-NEXT: s_mul_i32 s14, s6, s17 ; GFX9-NEXT: s_sub_u32 s8, s8, s14 ; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GFX9-NEXT: s_subb_u32 s21, s16, s7 -; GFX9-NEXT: s_sub_u32 s22, s8, s6 -; GFX9-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[16:17], 0 -; GFX9-NEXT: s_subb_u32 s16, s21, 0 -; GFX9-NEXT: s_cmp_ge_u32 s16, s7 -; GFX9-NEXT: s_cselect_b32 s17, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s22, s6 +; GFX9-NEXT: s_subb_u32 s19, s19, s7 +; GFX9-NEXT: s_sub_u32 s20, s8, s6 +; GFX9-NEXT: s_subb_u32 s19, s19, 0 +; GFX9-NEXT: s_cmp_ge_u32 s19, s7 ; GFX9-NEXT: s_cselect_b32 s21, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s16, s7 -; GFX9-NEXT: s_cselect_b32 s16, s21, s17 -; GFX9-NEXT: s_add_u32 s17, s18, 1 -; GFX9-NEXT: s_addc_u32 s21, s19, 0 -; GFX9-NEXT: s_add_u32 s22, s18, 2 -; GFX9-NEXT: s_addc_u32 s23, s19, 0 -; GFX9-NEXT: s_cmp_lg_u32 s16, 0 -; GFX9-NEXT: s_cselect_b32 s16, s22, s17 -; GFX9-NEXT: s_cselect_b32 s17, s23, s21 +; GFX9-NEXT: s_cmp_ge_u32 s20, s6 +; GFX9-NEXT: s_cselect_b32 s20, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s19, s7 +; GFX9-NEXT: s_cselect_b32 s19, s20, s21 +; GFX9-NEXT: s_add_u32 s20, s17, 1 +; GFX9-NEXT: s_addc_u32 s21, s16, 0 +; GFX9-NEXT: s_add_u32 s22, s17, 2 +; GFX9-NEXT: s_addc_u32 s23, s16, 0 +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cselect_b32 s19, s22, s20 +; GFX9-NEXT: s_cselect_b32 s20, s23, s21 ; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GFX9-NEXT: s_subb_u32 s9, s9, s20 +; GFX9-NEXT: s_subb_u32 s9, s9, s18 ; GFX9-NEXT: s_cmp_ge_u32 s9, s7 ; GFX9-NEXT: s_cselect_b32 s14, -1, 0 ; GFX9-NEXT: s_cmp_ge_u32 s8, s6 @@ -8740,12 +8704,12 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cmp_eq_u32 s9, s7 ; GFX9-NEXT: s_cselect_b32 s6, s6, s14 ; GFX9-NEXT: s_cmp_lg_u32 s6, 0 -; GFX9-NEXT: s_cselect_b32 s7, s17, s19 -; GFX9-NEXT: s_cselect_b32 s6, s16, s18 +; GFX9-NEXT: s_cselect_b32 s7, s20, s16 +; GFX9-NEXT: s_cselect_b32 s6, s19, s17 ; GFX9-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3] ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] -; GFX9-NEXT: s_sub_u32 s14, s6, s2 -; GFX9-NEXT: s_subb_u32 s15, s7, s3 +; GFX9-NEXT: s_sub_u32 s12, s6, s2 +; GFX9-NEXT: s_subb_u32 s13, s7, s3 ; GFX9-NEXT: s_ashr_i32 s2, s1, 31 ; GFX9-NEXT: s_add_u32 s0, s0, s2 ; GFX9-NEXT: s_mov_b32 s3, s2 @@ -8754,8 +8718,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_sub_u32 s8, 0, s6 -; GFX9-NEXT: s_subb_u32 s9, 0, s7 +; GFX9-NEXT: s_sub_u32 s4, 0, s6 +; GFX9-NEXT: s_subb_u32 s5, 0, s7 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -8765,105 +8729,98 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: v_readfirstlane_b32 s13, v2 -; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4 -; GFX9-NEXT: s_mul_i32 s16, s8, s13 -; GFX9-NEXT: s_mul_i32 s5, s9, s4 -; GFX9-NEXT: s_add_i32 s12, s12, s16 -; GFX9-NEXT: s_add_i32 s12, s12, s5 -; GFX9-NEXT: s_mul_i32 s17, s8, s4 -; GFX9-NEXT: s_mul_i32 s16, s4, s12 -; GFX9-NEXT: s_mul_hi_u32 s18, s4, s17 -; GFX9-NEXT: s_mul_hi_u32 s5, s4, s12 +; GFX9-NEXT: v_readfirstlane_b32 s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s15, v2 +; GFX9-NEXT: s_mul_hi_u32 s14, s4, s8 +; GFX9-NEXT: s_mul_i32 s16, s4, s15 +; GFX9-NEXT: s_mul_i32 s9, s5, s8 +; GFX9-NEXT: s_add_i32 s14, s14, s16 +; GFX9-NEXT: s_add_i32 s14, s14, s9 +; GFX9-NEXT: s_mul_i32 s17, s4, s8 +; GFX9-NEXT: s_mul_i32 s16, s8, s14 +; GFX9-NEXT: s_mul_hi_u32 s18, s8, s17 +; GFX9-NEXT: s_mul_hi_u32 s9, s8, s14 ; GFX9-NEXT: s_add_u32 s16, s18, s16 -; GFX9-NEXT: s_addc_u32 s5, 0, s5 -; GFX9-NEXT: s_mul_hi_u32 s19, s13, s17 -; GFX9-NEXT: s_mul_i32 s17, s13, s17 +; GFX9-NEXT: s_addc_u32 s9, 0, s9 +; GFX9-NEXT: s_mul_hi_u32 s19, s15, s17 +; GFX9-NEXT: s_mul_i32 s17, s15, s17 ; GFX9-NEXT: s_add_u32 s16, s16, s17 -; GFX9-NEXT: s_mul_hi_u32 s18, s13, s12 -; GFX9-NEXT: s_addc_u32 s5, s5, s19 +; GFX9-NEXT: s_mul_hi_u32 s18, s15, s14 +; GFX9-NEXT: s_addc_u32 s9, s9, s19 ; GFX9-NEXT: s_addc_u32 s16, s18, 0 -; GFX9-NEXT: s_mul_i32 s12, s13, s12 -; GFX9-NEXT: s_add_u32 s5, s5, s12 -; GFX9-NEXT: s_addc_u32 s12, 0, s16 -; GFX9-NEXT: s_add_u32 s16, s4, s5 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s12, s13, s12 -; GFX9-NEXT: s_mul_i32 s4, s8, s12 -; GFX9-NEXT: s_mul_hi_u32 s5, s8, s16 -; GFX9-NEXT: s_add_i32 s4, s5, s4 -; GFX9-NEXT: s_mul_i32 s9, s9, s16 -; GFX9-NEXT: s_add_i32 s4, s4, s9 -; GFX9-NEXT: s_mul_i32 s8, s8, s16 -; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8 -; GFX9-NEXT: s_mul_i32 s13, s12, s8 -; GFX9-NEXT: s_mul_i32 s18, s16, s4 -; GFX9-NEXT: s_mul_hi_u32 s8, s16, s8 -; GFX9-NEXT: s_mul_hi_u32 s17, s16, s4 -; GFX9-NEXT: s_add_u32 s8, s8, s18 +; GFX9-NEXT: s_mul_i32 s14, s15, s14 +; GFX9-NEXT: s_add_u32 s9, s9, s14 +; GFX9-NEXT: s_addc_u32 s14, 0, s16 +; GFX9-NEXT: s_add_u32 s8, s8, s9 +; GFX9-NEXT: s_addc_u32 s9, s15, s14 +; GFX9-NEXT: s_mul_i32 s14, s4, s9 +; GFX9-NEXT: s_mul_hi_u32 s15, s4, s8 +; GFX9-NEXT: s_add_i32 s14, s15, s14 +; GFX9-NEXT: s_mul_i32 s5, s5, s8 +; GFX9-NEXT: s_add_i32 s14, s14, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s15, s9, s4 +; GFX9-NEXT: s_mul_i32 s16, s9, s4 +; GFX9-NEXT: s_mul_i32 s18, s8, s14 +; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 +; GFX9-NEXT: s_mul_hi_u32 s17, s8, s14 +; GFX9-NEXT: s_add_u32 s4, s4, s18 ; GFX9-NEXT: s_addc_u32 s17, 0, s17 -; GFX9-NEXT: s_add_u32 s8, s8, s13 -; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4 -; GFX9-NEXT: s_addc_u32 s8, s17, s9 +; GFX9-NEXT: s_add_u32 s4, s4, s16 +; GFX9-NEXT: s_mul_hi_u32 s5, s9, s14 +; GFX9-NEXT: s_addc_u32 s4, s17, s15 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_mul_i32 s4, s12, s4 -; GFX9-NEXT: s_add_u32 s4, s8, s4 -; GFX9-NEXT: s_addc_u32 s8, 0, s5 -; GFX9-NEXT: s_add_u32 s13, s16, s4 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s12, s12, s8 +; GFX9-NEXT: s_mul_i32 s14, s9, s14 +; GFX9-NEXT: s_add_u32 s4, s4, s14 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_add_u32 s14, s8, s4 +; GFX9-NEXT: s_addc_u32 s15, s9, s5 ; GFX9-NEXT: s_ashr_i32 s4, s11, 31 ; GFX9-NEXT: s_add_u32 s8, s10, s4 ; GFX9-NEXT: s_mov_b32 s5, s4 ; GFX9-NEXT: s_addc_u32 s9, s11, s4 ; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5] -; GFX9-NEXT: s_mul_i32 s11, s8, s12 -; GFX9-NEXT: s_mul_hi_u32 s16, s8, s13 -; GFX9-NEXT: s_mul_hi_u32 s10, s8, s12 +; GFX9-NEXT: s_mul_i32 s11, s8, s15 +; GFX9-NEXT: s_mul_hi_u32 s16, s8, s14 +; GFX9-NEXT: s_mul_hi_u32 s10, s8, s15 ; GFX9-NEXT: s_add_u32 s11, s16, s11 ; GFX9-NEXT: s_addc_u32 s10, 0, s10 -; GFX9-NEXT: s_mul_hi_u32 s17, s9, s13 -; GFX9-NEXT: s_mul_i32 s13, s9, s13 -; GFX9-NEXT: s_add_u32 s11, s11, s13 -; GFX9-NEXT: s_mul_hi_u32 s16, s9, s12 +; GFX9-NEXT: s_mul_hi_u32 s17, s9, s14 +; GFX9-NEXT: s_mul_i32 s14, s9, s14 +; GFX9-NEXT: s_add_u32 s11, s11, s14 +; GFX9-NEXT: s_mul_hi_u32 s16, s9, s15 ; GFX9-NEXT: s_addc_u32 s10, s10, s17 ; GFX9-NEXT: s_addc_u32 s11, s16, 0 -; GFX9-NEXT: s_mul_i32 s12, s9, s12 -; GFX9-NEXT: s_add_u32 s16, s10, s12 -; GFX9-NEXT: s_addc_u32 s17, 0, s11 -; GFX9-NEXT: s_mul_i32 s10, s6, s17 -; GFX9-NEXT: s_mul_hi_u32 s11, s6, s16 +; GFX9-NEXT: s_mul_i32 s14, s9, s15 +; GFX9-NEXT: s_add_u32 s14, s10, s14 +; GFX9-NEXT: s_addc_u32 s15, 0, s11 +; GFX9-NEXT: s_mul_i32 s10, s6, s15 +; GFX9-NEXT: s_mul_hi_u32 s11, s6, s14 ; GFX9-NEXT: s_add_i32 s10, s11, s10 -; GFX9-NEXT: s_mul_i32 s11, s7, s16 -; GFX9-NEXT: s_add_i32 s18, s10, s11 -; GFX9-NEXT: s_sub_i32 s12, s9, s18 -; GFX9-NEXT: s_mul_i32 s10, s6, s16 +; GFX9-NEXT: s_mul_i32 s11, s7, s14 +; GFX9-NEXT: s_add_i32 s16, s10, s11 +; GFX9-NEXT: s_sub_i32 s17, s9, s16 +; GFX9-NEXT: s_mul_i32 s10, s6, s14 ; GFX9-NEXT: s_sub_u32 s8, s8, s10 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s19, s12, s7 -; GFX9-NEXT: s_sub_u32 s20, s8, s6 -; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GFX9-NEXT: s_subb_u32 s12, s19, 0 -; GFX9-NEXT: s_cmp_ge_u32 s12, s7 -; GFX9-NEXT: s_cselect_b32 s13, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s20, s6 +; GFX9-NEXT: s_subb_u32 s17, s17, s7 +; GFX9-NEXT: s_sub_u32 s18, s8, s6 +; GFX9-NEXT: s_subb_u32 s17, s17, 0 +; GFX9-NEXT: s_cmp_ge_u32 s17, s7 ; GFX9-NEXT: s_cselect_b32 s19, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s12, s7 -; GFX9-NEXT: s_cselect_b32 s12, s19, s13 -; GFX9-NEXT: s_add_u32 s13, s16, 1 -; GFX9-NEXT: s_addc_u32 s19, s17, 0 -; GFX9-NEXT: s_add_u32 s20, s16, 2 -; GFX9-NEXT: s_addc_u32 s21, s17, 0 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 -; GFX9-NEXT: s_cselect_b32 s12, s20, s13 -; GFX9-NEXT: s_cselect_b32 s13, s21, s19 +; GFX9-NEXT: s_cmp_ge_u32 s18, s6 +; GFX9-NEXT: s_cselect_b32 s18, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s17, s7 +; GFX9-NEXT: s_cselect_b32 s17, s18, s19 +; GFX9-NEXT: s_add_u32 s18, s14, 1 +; GFX9-NEXT: s_addc_u32 s19, s15, 0 +; GFX9-NEXT: s_add_u32 s20, s14, 2 +; GFX9-NEXT: s_addc_u32 s21, s15, 0 +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cselect_b32 s17, s20, s18 +; GFX9-NEXT: s_cselect_b32 s18, s21, s19 ; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s9, s9, s18 +; GFX9-NEXT: s_subb_u32 s9, s9, s16 ; GFX9-NEXT: s_cmp_ge_u32 s9, s7 ; GFX9-NEXT: s_cselect_b32 s10, -1, 0 ; GFX9-NEXT: s_cmp_ge_u32 s8, s6 @@ -8871,14 +8828,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cmp_eq_u32 s9, s7 ; GFX9-NEXT: s_cselect_b32 s6, s6, s10 ; GFX9-NEXT: s_cmp_lg_u32 s6, 0 -; GFX9-NEXT: s_cselect_b32 s7, s13, s17 -; GFX9-NEXT: s_cselect_b32 s6, s12, s16 +; GFX9-NEXT: s_cselect_b32 s7, s18, s15 +; GFX9-NEXT: s_cselect_b32 s6, s17, s14 ; GFX9-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] ; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], s[2:3] ; GFX9-NEXT: s_sub_u32 s2, s4, s2 ; GFX9-NEXT: s_subb_u32 s3, s5, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -9099,10 +9056,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_addc_u32 s13, 0, s14 ; GFX6-NEXT: s_add_u32 s14, s0, s1 ; GFX6-NEXT: v_mov_b32_e32 v0, s14 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 +; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_addc_u32 s12, s12, s13 ; GFX6-NEXT: s_mul_i32 s0, s10, s12 ; GFX6-NEXT: v_readfirstlane_b32 s1, v0 @@ -9133,7 +9089,6 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_add_u32 s13, s14, s0 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_addc_u32 s12, s12, s10 ; GFX6-NEXT: s_ashr_i32 s10, s7, 31 ; GFX6-NEXT: s_add_u32 s0, s6, s10 @@ -9168,46 +9123,43 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: v_readfirstlane_b32 s5, v0 ; GFX6-NEXT: s_add_i32 s4, s5, s4 ; GFX6-NEXT: s_mul_i32 s5, s9, s12 -; GFX6-NEXT: s_add_i32 s13, s4, s5 -; GFX6-NEXT: s_sub_i32 s14, s7, s13 +; GFX6-NEXT: s_add_i32 s14, s4, s5 +; GFX6-NEXT: s_sub_i32 s13, s7, s14 ; GFX6-NEXT: s_mul_i32 s4, s8, s12 ; GFX6-NEXT: s_sub_u32 s6, s6, s4 ; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX6-NEXT: s_or_b32 s12, s4, s5 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_subb_u32 s14, s14, s9 -; GFX6-NEXT: s_sub_u32 s15, s6, s8 -; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX6-NEXT: s_subb_u32 s15, s13, s9 +; GFX6-NEXT: s_sub_u32 s16, s6, s8 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s17, s12, s13 +; GFX6-NEXT: s_subb_u32 s17, s15, 0 +; GFX6-NEXT: s_cmp_ge_u32 s17, s9 +; GFX6-NEXT: s_cselect_b32 s18, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s16, s8 +; GFX6-NEXT: s_cselect_b32 s19, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s17, s9 +; GFX6-NEXT: s_cselect_b32 s18, s19, s18 +; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_subb_u32 s15, s15, s9 +; GFX6-NEXT: s_sub_u32 s19, s16, s8 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_subb_u32 s12, s15, 0 +; GFX6-NEXT: s_cmp_lg_u32 s18, 0 +; GFX6-NEXT: s_cselect_b32 s13, s19, s16 +; GFX6-NEXT: s_cselect_b32 s12, s12, s17 ; GFX6-NEXT: s_or_b32 s4, s4, s5 -; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_subb_u32 s16, s14, 0 -; GFX6-NEXT: s_cmp_ge_u32 s16, s9 +; GFX6-NEXT: s_subb_u32 s4, s7, s14 +; GFX6-NEXT: s_cmp_ge_u32 s4, s9 ; GFX6-NEXT: s_cselect_b32 s5, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s15, s8 -; GFX6-NEXT: s_cselect_b32 s17, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s16, s9 -; GFX6-NEXT: s_cselect_b32 s17, s17, s5 -; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_subb_u32 s14, s14, s9 -; GFX6-NEXT: s_sub_u32 s18, s15, s8 -; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX6-NEXT: s_or_b32 s4, s4, s5 -; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_subb_u32 s4, s14, 0 -; GFX6-NEXT: s_cmp_lg_u32 s17, 0 -; GFX6-NEXT: s_cselect_b32 s14, s18, s15 -; GFX6-NEXT: s_cselect_b32 s4, s4, s16 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_subb_u32 s5, s7, s13 -; GFX6-NEXT: s_cmp_ge_u32 s5, s9 -; GFX6-NEXT: s_cselect_b32 s7, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s6, s8 -; GFX6-NEXT: s_cselect_b32 s8, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s5, s9 -; GFX6-NEXT: s_cselect_b32 s7, s8, s7 -; GFX6-NEXT: s_cmp_lg_u32 s7, 0 -; GFX6-NEXT: s_cselect_b32 s5, s4, s5 -; GFX6-NEXT: s_cselect_b32 s4, s14, s6 +; GFX6-NEXT: s_cselect_b32 s7, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s4, s9 +; GFX6-NEXT: s_cselect_b32 s5, s7, s5 +; GFX6-NEXT: s_cmp_lg_u32 s5, 0 +; GFX6-NEXT: s_cselect_b32 s5, s12, s4 +; GFX6-NEXT: s_cselect_b32 s4, s13, s6 ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11] ; GFX6-NEXT: s_sub_u32 s4, s4, s10 ; GFX6-NEXT: s_subb_u32 s5, s5, s10 @@ -9229,8 +9181,8 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_sub_u32 s8, 0, s6 -; GFX9-NEXT: s_subb_u32 s9, 0, s7 +; GFX9-NEXT: s_sub_u32 s4, 0, s6 +; GFX9-NEXT: s_subb_u32 s5, 0, s7 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -9240,56 +9192,52 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s10, v2 -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_mul_i32 s5, s8, s10 -; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4 -; GFX9-NEXT: s_mul_i32 s11, s9, s4 -; GFX9-NEXT: s_add_i32 s5, s12, s5 -; GFX9-NEXT: s_mul_i32 s13, s8, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s11 -; GFX9-NEXT: s_mul_hi_u32 s12, s4, s13 -; GFX9-NEXT: s_mul_i32 s14, s4, s5 -; GFX9-NEXT: s_mul_hi_u32 s11, s4, s5 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v1 +; GFX9-NEXT: s_mul_i32 s10, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s12, s4, s9 +; GFX9-NEXT: s_mul_i32 s11, s5, s9 +; GFX9-NEXT: s_add_i32 s10, s12, s10 +; GFX9-NEXT: s_mul_i32 s13, s4, s9 +; GFX9-NEXT: s_add_i32 s10, s10, s11 +; GFX9-NEXT: s_mul_hi_u32 s12, s9, s13 +; GFX9-NEXT: s_mul_i32 s14, s9, s10 +; GFX9-NEXT: s_mul_hi_u32 s11, s9, s10 ; GFX9-NEXT: s_add_u32 s12, s12, s14 ; GFX9-NEXT: s_addc_u32 s11, 0, s11 -; GFX9-NEXT: s_mul_hi_u32 s15, s10, s13 -; GFX9-NEXT: s_mul_i32 s13, s10, s13 +; GFX9-NEXT: s_mul_hi_u32 s15, s8, s13 +; GFX9-NEXT: s_mul_i32 s13, s8, s13 ; GFX9-NEXT: s_add_u32 s12, s12, s13 -; GFX9-NEXT: s_mul_hi_u32 s14, s10, s5 +; GFX9-NEXT: s_mul_hi_u32 s14, s8, s10 ; GFX9-NEXT: s_addc_u32 s11, s11, s15 ; GFX9-NEXT: s_addc_u32 s12, s14, 0 -; GFX9-NEXT: s_mul_i32 s5, s10, s5 -; GFX9-NEXT: s_add_u32 s5, s11, s5 +; GFX9-NEXT: s_mul_i32 s10, s8, s10 +; GFX9-NEXT: s_add_u32 s10, s11, s10 ; GFX9-NEXT: s_addc_u32 s11, 0, s12 -; GFX9-NEXT: s_add_u32 s12, s4, s5 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s10, s10, s11 -; GFX9-NEXT: s_mul_i32 s4, s8, s10 -; GFX9-NEXT: s_mul_hi_u32 s5, s8, s12 -; GFX9-NEXT: s_add_i32 s4, s5, s4 -; GFX9-NEXT: s_mul_i32 s9, s9, s12 -; GFX9-NEXT: s_add_i32 s4, s4, s9 -; GFX9-NEXT: s_mul_i32 s8, s8, s12 -; GFX9-NEXT: s_mul_hi_u32 s9, s10, s8 -; GFX9-NEXT: s_mul_i32 s11, s10, s8 -; GFX9-NEXT: s_mul_i32 s14, s12, s4 -; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8 -; GFX9-NEXT: s_mul_hi_u32 s13, s12, s4 -; GFX9-NEXT: s_add_u32 s8, s8, s14 +; GFX9-NEXT: s_add_u32 s9, s9, s10 +; GFX9-NEXT: s_addc_u32 s8, s8, s11 +; GFX9-NEXT: s_mul_i32 s10, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s11, s4, s9 +; GFX9-NEXT: s_add_i32 s10, s11, s10 +; GFX9-NEXT: s_mul_i32 s5, s5, s9 +; GFX9-NEXT: s_add_i32 s10, s10, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s9 +; GFX9-NEXT: s_mul_hi_u32 s11, s8, s4 +; GFX9-NEXT: s_mul_i32 s12, s8, s4 +; GFX9-NEXT: s_mul_i32 s14, s9, s10 +; GFX9-NEXT: s_mul_hi_u32 s4, s9, s4 +; GFX9-NEXT: s_mul_hi_u32 s13, s9, s10 +; GFX9-NEXT: s_add_u32 s4, s4, s14 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_add_u32 s8, s8, s11 -; GFX9-NEXT: s_mul_hi_u32 s5, s10, s4 -; GFX9-NEXT: s_addc_u32 s8, s13, s9 +; GFX9-NEXT: s_add_u32 s4, s4, s12 +; GFX9-NEXT: s_mul_hi_u32 s5, s8, s10 +; GFX9-NEXT: s_addc_u32 s4, s13, s11 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_mul_i32 s4, s10, s4 -; GFX9-NEXT: s_add_u32 s4, s8, s4 -; GFX9-NEXT: s_addc_u32 s8, 0, s5 -; GFX9-NEXT: s_add_u32 s9, s12, s4 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s8, s10, s8 +; GFX9-NEXT: s_mul_i32 s10, s8, s10 +; GFX9-NEXT: s_add_u32 s4, s4, s10 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_add_u32 s9, s9, s4 +; GFX9-NEXT: s_addc_u32 s8, s8, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: s_add_u32 s2, s2, s4 @@ -9319,11 +9267,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_mul_i32 s8, s6, s8 ; GFX9-NEXT: s_sub_u32 s2, s2, s8 ; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_subb_u32 s13, s10, s7 ; GFX9-NEXT: s_sub_u32 s14, s2, s6 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 ; GFX9-NEXT: s_subb_u32 s15, s13, 0 ; GFX9-NEXT: s_cmp_ge_u32 s15, s7 ; GFX9-NEXT: s_cselect_b32 s16, -1, 0 @@ -9332,13 +9278,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_cmp_eq_u32 s15, s7 ; GFX9-NEXT: s_cselect_b32 s16, s17, s16 ; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s13, s13, s7 -; GFX9-NEXT: s_sub_u32 s17, s14, s6 -; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s10, s13, 0 +; GFX9-NEXT: s_subb_u32 s10, s13, s7 +; GFX9-NEXT: s_sub_u32 s11, s14, s6 +; GFX9-NEXT: s_subb_u32 s10, s10, 0 ; GFX9-NEXT: s_cmp_lg_u32 s16, 0 -; GFX9-NEXT: s_cselect_b32 s11, s17, s14 +; GFX9-NEXT: s_cselect_b32 s11, s11, s14 ; GFX9-NEXT: s_cselect_b32 s10, s10, s15 ; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_subb_u32 s3, s3, s12 @@ -9500,10 +9444,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_addc_u32 s15, 0, s16 ; GFX6-NEXT: s_add_u32 s16, s6, s7 ; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX6-NEXT: s_or_b32 s6, s6, s7 -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 ; GFX6-NEXT: s_addc_u32 s14, s14, s15 ; GFX6-NEXT: s_mul_i32 s6, s12, s14 ; GFX6-NEXT: v_readfirstlane_b32 s7, v0 @@ -9534,7 +9477,6 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_add_u32 s13, s16, s6 ; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX6-NEXT: s_or_b32 s6, s6, s7 -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 ; GFX6-NEXT: s_addc_u32 s12, s14, s12 ; GFX6-NEXT: s_ashr_i32 s6, s9, 31 ; GFX6-NEXT: s_add_u32 s8, s8, s6 @@ -9567,49 +9509,46 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_readfirstlane_b32 s14, v0 ; GFX6-NEXT: s_add_i32 s13, s14, s13 ; GFX6-NEXT: s_mul_i32 s14, s3, s12 -; GFX6-NEXT: s_add_i32 s14, s13, s14 -; GFX6-NEXT: s_sub_i32 s15, s9, s14 +; GFX6-NEXT: s_add_i32 s16, s13, s14 +; GFX6-NEXT: s_sub_i32 s14, s9, s16 ; GFX6-NEXT: s_mul_i32 s12, s2, s12 ; GFX6-NEXT: s_sub_u32 s8, s8, s12 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s16, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_subb_u32 s15, s15, s3 -; GFX6-NEXT: s_sub_u32 s17, s8, s2 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_subb_u32 s18, s15, 0 -; GFX6-NEXT: s_cmp_ge_u32 s18, s3 -; GFX6-NEXT: s_cselect_b32 s13, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s17, s2 -; GFX6-NEXT: s_cselect_b32 s19, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s18, s3 -; GFX6-NEXT: s_cselect_b32 s19, s19, s13 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_subb_u32 s15, s15, s3 -; GFX6-NEXT: s_sub_u32 s20, s17, s2 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s15, s12, s13 +; GFX6-NEXT: s_subb_u32 s17, s14, s3 +; GFX6-NEXT: s_sub_u32 s18, s8, s2 +; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX6-NEXT: s_or_b32 s19, s14, s15 +; GFX6-NEXT: s_subb_u32 s19, s17, 0 +; GFX6-NEXT: s_cmp_ge_u32 s19, s3 +; GFX6-NEXT: s_cselect_b32 s20, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s18, s2 +; GFX6-NEXT: s_cselect_b32 s21, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s19, s3 +; GFX6-NEXT: s_cselect_b32 s20, s21, s20 +; GFX6-NEXT: s_or_b32 s14, s14, s15 +; GFX6-NEXT: s_subb_u32 s17, s17, s3 +; GFX6-NEXT: s_sub_u32 s21, s18, s2 +; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX6-NEXT: s_or_b32 s14, s14, s15 +; GFX6-NEXT: s_subb_u32 s14, s17, 0 +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_cselect_b32 s15, s21, s18 +; GFX6-NEXT: s_cselect_b32 s14, s14, s19 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_subb_u32 s12, s15, 0 -; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: s_cselect_b32 s13, s20, s17 -; GFX6-NEXT: s_cselect_b32 s12, s12, s18 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_subb_u32 s9, s9, s14 +; GFX6-NEXT: s_subb_u32 s9, s9, s16 ; GFX6-NEXT: s_cmp_ge_u32 s9, s3 -; GFX6-NEXT: s_cselect_b32 s14, -1, 0 +; GFX6-NEXT: s_cselect_b32 s12, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s8, s2 ; GFX6-NEXT: s_cselect_b32 s2, -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s9, s3 -; GFX6-NEXT: s_cselect_b32 s2, s2, s14 +; GFX6-NEXT: s_cselect_b32 s2, s2, s12 ; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_cselect_b32 s3, s12, s9 -; GFX6-NEXT: s_cselect_b32 s2, s13, s8 +; GFX6-NEXT: s_cselect_b32 s3, s14, s9 +; GFX6-NEXT: s_cselect_b32 s2, s15, s8 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] -; GFX6-NEXT: s_sub_u32 s12, s2, s6 -; GFX6-NEXT: s_subb_u32 s13, s3, s6 +; GFX6-NEXT: s_sub_u32 s14, s2, s6 +; GFX6-NEXT: s_subb_u32 s15, s3, s6 ; GFX6-NEXT: s_ashr_i32 s2, s1, 31 ; GFX6-NEXT: s_add_u32 s0, s0, s2 ; GFX6-NEXT: s_mov_b32 s3, s2 @@ -9628,40 +9567,39 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s1, s8, s14 +; GFX6-NEXT: s_mul_i32 s1, s8, s12 ; GFX6-NEXT: v_readfirstlane_b32 s3, v2 ; GFX6-NEXT: s_mul_i32 s0, s9, s2 ; GFX6-NEXT: s_add_i32 s1, s3, s1 ; GFX6-NEXT: s_add_i32 s3, s1, s0 -; GFX6-NEXT: s_mul_i32 s15, s8, s2 +; GFX6-NEXT: s_mul_i32 s13, s8, s2 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mul_i32 s4, s2, s3 ; GFX6-NEXT: v_readfirstlane_b32 s5, v2 ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, s13 ; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3 ; GFX6-NEXT: s_add_u32 s4, s16, s4 ; GFX6-NEXT: s_addc_u32 s5, 0, s5 -; GFX6-NEXT: s_mul_i32 s15, s14, s15 +; GFX6-NEXT: s_mul_i32 s13, s12, s13 ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 -; GFX6-NEXT: s_add_u32 s4, s4, s15 +; GFX6-NEXT: s_add_u32 s4, s4, s13 ; GFX6-NEXT: s_addc_u32 s4, s5, s16 ; GFX6-NEXT: v_readfirstlane_b32 s5, v1 ; GFX6-NEXT: s_addc_u32 s5, s5, 0 -; GFX6-NEXT: s_mul_i32 s3, s14, s3 +; GFX6-NEXT: s_mul_i32 s3, s12, s3 ; GFX6-NEXT: s_add_u32 s3, s4, s3 ; GFX6-NEXT: s_addc_u32 s4, 0, s5 ; GFX6-NEXT: s_add_u32 s5, s2, s3 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_addc_u32 s4, s14, s4 +; GFX6-NEXT: s_addc_u32 s4, s12, s4 ; GFX6-NEXT: s_mul_i32 s2, s8, s4 ; GFX6-NEXT: v_readfirstlane_b32 s3, v0 ; GFX6-NEXT: s_add_i32 s2, s3, s2 @@ -9675,102 +9613,98 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: s_mul_i32 s9, s5, s2 -; GFX6-NEXT: v_readfirstlane_b32 s15, v2 -; GFX6-NEXT: s_add_u32 s9, s15, s9 -; GFX6-NEXT: v_readfirstlane_b32 s14, v0 +; GFX6-NEXT: v_readfirstlane_b32 s13, v2 +; GFX6-NEXT: s_add_u32 s9, s13, s9 +; GFX6-NEXT: v_readfirstlane_b32 s12, v0 ; GFX6-NEXT: s_mul_i32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s14, 0, s14 +; GFX6-NEXT: s_addc_u32 s12, 0, s12 ; GFX6-NEXT: v_readfirstlane_b32 s8, v3 ; GFX6-NEXT: s_add_u32 s3, s9, s3 -; GFX6-NEXT: s_addc_u32 s3, s14, s8 +; GFX6-NEXT: s_addc_u32 s3, s12, s8 ; GFX6-NEXT: v_readfirstlane_b32 s8, v1 ; GFX6-NEXT: s_addc_u32 s8, s8, 0 ; GFX6-NEXT: s_mul_i32 s2, s4, s2 ; GFX6-NEXT: s_add_u32 s2, s3, s2 ; GFX6-NEXT: s_addc_u32 s8, 0, s8 -; GFX6-NEXT: s_add_u32 s14, s5, s2 +; GFX6-NEXT: s_add_u32 s12, s5, s2 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_addc_u32 s15, s4, s8 +; GFX6-NEXT: s_addc_u32 s13, s4, s8 ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 ; GFX6-NEXT: s_add_u32 s2, s10, s4 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_addc_u32 s3, s11, s4 ; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, s15 +; GFX6-NEXT: v_mov_b32_e32 v0, s13 ; GFX6-NEXT: v_mul_hi_u32 v1, s8, v0 -; GFX6-NEXT: v_mov_b32_e32 v2, s14 +; GFX6-NEXT: v_mov_b32_e32 v2, s12 ; GFX6-NEXT: v_mul_hi_u32 v3, s8, v2 -; GFX6-NEXT: s_mul_i32 s2, s8, s15 +; GFX6-NEXT: s_mul_i32 s2, s8, s13 ; GFX6-NEXT: v_readfirstlane_b32 s10, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s9, v2 ; GFX6-NEXT: v_readfirstlane_b32 s11, v3 ; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 ; GFX6-NEXT: s_add_u32 s2, s11, s2 ; GFX6-NEXT: s_addc_u32 s10, 0, s10 -; GFX6-NEXT: s_mul_i32 s11, s9, s14 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: s_mul_i32 s11, s9, s12 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 ; GFX6-NEXT: s_add_u32 s2, s2, s11 -; GFX6-NEXT: s_addc_u32 s2, s10, s14 +; GFX6-NEXT: s_addc_u32 s2, s10, s12 ; GFX6-NEXT: v_readfirstlane_b32 s10, v0 ; GFX6-NEXT: s_addc_u32 s10, s10, 0 -; GFX6-NEXT: s_mul_i32 s11, s9, s15 +; GFX6-NEXT: s_mul_i32 s11, s9, s13 ; GFX6-NEXT: s_add_u32 s11, s2, s11 ; GFX6-NEXT: v_mov_b32_e32 v0, s11 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX6-NEXT: s_addc_u32 s10, 0, s10 ; GFX6-NEXT: s_mul_i32 s10, s6, s10 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: v_readfirstlane_b32 s14, v0 -; GFX6-NEXT: s_add_i32 s10, s14, s10 -; GFX6-NEXT: s_mul_i32 s14, s7, s11 -; GFX6-NEXT: s_add_i32 s14, s10, s14 -; GFX6-NEXT: s_sub_i32 s15, s9, s14 +; GFX6-NEXT: v_readfirstlane_b32 s12, v0 +; GFX6-NEXT: s_add_i32 s10, s12, s10 +; GFX6-NEXT: s_mul_i32 s12, s7, s11 +; GFX6-NEXT: s_add_i32 s16, s10, s12 +; GFX6-NEXT: s_sub_i32 s12, s9, s16 ; GFX6-NEXT: s_mul_i32 s10, s6, s11 ; GFX6-NEXT: s_sub_u32 s8, s8, s10 ; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX6-NEXT: s_or_b32 s16, s10, s11 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_subb_u32 s15, s15, s7 -; GFX6-NEXT: s_sub_u32 s17, s8, s6 -; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX6-NEXT: s_or_b32 s10, s10, s11 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 -; GFX6-NEXT: s_subb_u32 s18, s15, 0 -; GFX6-NEXT: s_cmp_ge_u32 s18, s7 -; GFX6-NEXT: s_cselect_b32 s11, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s17, s6 -; GFX6-NEXT: s_cselect_b32 s19, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s18, s7 -; GFX6-NEXT: s_cselect_b32 s19, s19, s11 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 -; GFX6-NEXT: s_subb_u32 s15, s15, s7 -; GFX6-NEXT: s_sub_u32 s20, s17, s6 -; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX6-NEXT: s_or_b32 s13, s10, s11 +; GFX6-NEXT: s_subb_u32 s17, s12, s7 +; GFX6-NEXT: s_sub_u32 s18, s8, s6 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s19, s12, s13 +; GFX6-NEXT: s_subb_u32 s19, s17, 0 +; GFX6-NEXT: s_cmp_ge_u32 s19, s7 +; GFX6-NEXT: s_cselect_b32 s20, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s18, s6 +; GFX6-NEXT: s_cselect_b32 s21, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s19, s7 +; GFX6-NEXT: s_cselect_b32 s20, s21, s20 +; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_subb_u32 s17, s17, s7 +; GFX6-NEXT: s_sub_u32 s21, s18, s6 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_subb_u32 s12, s17, 0 +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_cselect_b32 s13, s21, s18 +; GFX6-NEXT: s_cselect_b32 s12, s12, s19 ; GFX6-NEXT: s_or_b32 s10, s10, s11 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 -; GFX6-NEXT: s_subb_u32 s10, s15, 0 -; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: s_cselect_b32 s11, s20, s17 -; GFX6-NEXT: s_cselect_b32 s10, s10, s18 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_subb_u32 s9, s9, s14 +; GFX6-NEXT: s_subb_u32 s9, s9, s16 ; GFX6-NEXT: s_cmp_ge_u32 s9, s7 -; GFX6-NEXT: s_cselect_b32 s14, -1, 0 +; GFX6-NEXT: s_cselect_b32 s10, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s8, s6 ; GFX6-NEXT: s_cselect_b32 s6, -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s9, s7 -; GFX6-NEXT: s_cselect_b32 s6, s6, s14 +; GFX6-NEXT: s_cselect_b32 s6, s6, s10 ; GFX6-NEXT: s_cmp_lg_u32 s6, 0 -; GFX6-NEXT: s_cselect_b32 s7, s10, s9 -; GFX6-NEXT: s_cselect_b32 s6, s11, s8 +; GFX6-NEXT: s_cselect_b32 s7, s12, s9 +; GFX6-NEXT: s_cselect_b32 s6, s13, s8 ; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GFX6-NEXT: s_sub_u32 s5, s6, s4 ; GFX6-NEXT: s_subb_u32 s4, s7, s4 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v0, s12 -; GFX6-NEXT: v_mov_b32_e32 v1, s13 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: v_mov_b32_e32 v1, s15 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: v_mov_b32_e32 v3, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9790,8 +9724,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX9-NEXT: s_sub_u32 s12, 0, s2 -; GFX9-NEXT: s_subb_u32 s13, 0, s3 +; GFX9-NEXT: s_sub_u32 s6, 0, s2 +; GFX9-NEXT: s_subb_u32 s7, 0, s3 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -9800,56 +9734,52 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s14, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s7, s12, s14 -; GFX9-NEXT: s_mul_hi_u32 s16, s12, s6 -; GFX9-NEXT: s_mul_i32 s15, s13, s6 -; GFX9-NEXT: s_add_i32 s7, s16, s7 -; GFX9-NEXT: s_mul_i32 s17, s12, s6 -; GFX9-NEXT: s_add_i32 s7, s7, s15 -; GFX9-NEXT: s_mul_hi_u32 s16, s6, s17 -; GFX9-NEXT: s_mul_i32 s18, s6, s7 -; GFX9-NEXT: s_mul_hi_u32 s15, s6, s7 +; GFX9-NEXT: v_readfirstlane_b32 s12, v1 +; GFX9-NEXT: v_readfirstlane_b32 s13, v0 +; GFX9-NEXT: s_mul_i32 s14, s6, s12 +; GFX9-NEXT: s_mul_hi_u32 s16, s6, s13 +; GFX9-NEXT: s_mul_i32 s15, s7, s13 +; GFX9-NEXT: s_add_i32 s14, s16, s14 +; GFX9-NEXT: s_mul_i32 s17, s6, s13 +; GFX9-NEXT: s_add_i32 s14, s14, s15 +; GFX9-NEXT: s_mul_hi_u32 s16, s13, s17 +; GFX9-NEXT: s_mul_i32 s18, s13, s14 +; GFX9-NEXT: s_mul_hi_u32 s15, s13, s14 ; GFX9-NEXT: s_add_u32 s16, s16, s18 ; GFX9-NEXT: s_addc_u32 s15, 0, s15 -; GFX9-NEXT: s_mul_hi_u32 s18, s14, s17 -; GFX9-NEXT: s_mul_i32 s17, s14, s17 +; GFX9-NEXT: s_mul_hi_u32 s18, s12, s17 +; GFX9-NEXT: s_mul_i32 s17, s12, s17 ; GFX9-NEXT: s_add_u32 s16, s16, s17 -; GFX9-NEXT: s_mul_hi_u32 s19, s14, s7 +; GFX9-NEXT: s_mul_hi_u32 s19, s12, s14 ; GFX9-NEXT: s_addc_u32 s15, s15, s18 ; GFX9-NEXT: s_addc_u32 s16, s19, 0 -; GFX9-NEXT: s_mul_i32 s7, s14, s7 -; GFX9-NEXT: s_add_u32 s7, s15, s7 +; GFX9-NEXT: s_mul_i32 s14, s12, s14 +; GFX9-NEXT: s_add_u32 s14, s15, s14 ; GFX9-NEXT: s_addc_u32 s15, 0, s16 -; GFX9-NEXT: s_add_u32 s16, s6, s7 -; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9-NEXT: s_addc_u32 s14, s14, s15 -; GFX9-NEXT: s_mul_i32 s6, s12, s14 -; GFX9-NEXT: s_mul_hi_u32 s7, s12, s16 -; GFX9-NEXT: s_add_i32 s6, s7, s6 -; GFX9-NEXT: s_mul_i32 s13, s13, s16 -; GFX9-NEXT: s_add_i32 s6, s6, s13 -; GFX9-NEXT: s_mul_i32 s12, s12, s16 -; GFX9-NEXT: s_mul_hi_u32 s13, s14, s12 -; GFX9-NEXT: s_mul_i32 s15, s14, s12 -; GFX9-NEXT: s_mul_i32 s18, s16, s6 -; GFX9-NEXT: s_mul_hi_u32 s12, s16, s12 -; GFX9-NEXT: s_mul_hi_u32 s17, s16, s6 -; GFX9-NEXT: s_add_u32 s12, s12, s18 +; GFX9-NEXT: s_add_u32 s13, s13, s14 +; GFX9-NEXT: s_addc_u32 s12, s12, s15 +; GFX9-NEXT: s_mul_i32 s14, s6, s12 +; GFX9-NEXT: s_mul_hi_u32 s15, s6, s13 +; GFX9-NEXT: s_add_i32 s14, s15, s14 +; GFX9-NEXT: s_mul_i32 s7, s7, s13 +; GFX9-NEXT: s_add_i32 s14, s14, s7 +; GFX9-NEXT: s_mul_i32 s6, s6, s13 +; GFX9-NEXT: s_mul_hi_u32 s15, s12, s6 +; GFX9-NEXT: s_mul_i32 s16, s12, s6 +; GFX9-NEXT: s_mul_i32 s18, s13, s14 +; GFX9-NEXT: s_mul_hi_u32 s6, s13, s6 +; GFX9-NEXT: s_mul_hi_u32 s17, s13, s14 +; GFX9-NEXT: s_add_u32 s6, s6, s18 ; GFX9-NEXT: s_addc_u32 s17, 0, s17 -; GFX9-NEXT: s_add_u32 s12, s12, s15 -; GFX9-NEXT: s_mul_hi_u32 s7, s14, s6 -; GFX9-NEXT: s_addc_u32 s12, s17, s13 +; GFX9-NEXT: s_add_u32 s6, s6, s16 +; GFX9-NEXT: s_mul_hi_u32 s7, s12, s14 +; GFX9-NEXT: s_addc_u32 s6, s17, s15 ; GFX9-NEXT: s_addc_u32 s7, s7, 0 -; GFX9-NEXT: s_mul_i32 s6, s14, s6 -; GFX9-NEXT: s_add_u32 s6, s12, s6 -; GFX9-NEXT: s_addc_u32 s12, 0, s7 -; GFX9-NEXT: s_add_u32 s13, s16, s6 -; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9-NEXT: s_addc_u32 s12, s14, s12 +; GFX9-NEXT: s_mul_i32 s14, s12, s14 +; GFX9-NEXT: s_add_u32 s6, s6, s14 +; GFX9-NEXT: s_addc_u32 s7, 0, s7 +; GFX9-NEXT: s_add_u32 s13, s13, s6 +; GFX9-NEXT: s_addc_u32 s12, s12, s7 ; GFX9-NEXT: s_ashr_i32 s6, s9, 31 ; GFX9-NEXT: s_add_u32 s8, s8, s6 ; GFX9-NEXT: s_mov_b32 s7, s6 @@ -9878,11 +9808,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_mul_i32 s12, s2, s12 ; GFX9-NEXT: s_sub_u32 s8, s8, s12 ; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 ; GFX9-NEXT: s_subb_u32 s17, s14, s3 ; GFX9-NEXT: s_sub_u32 s18, s8, s2 ; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 ; GFX9-NEXT: s_subb_u32 s19, s17, 0 ; GFX9-NEXT: s_cmp_ge_u32 s19, s3 ; GFX9-NEXT: s_cselect_b32 s20, -1, 0 @@ -9891,13 +9819,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cmp_eq_u32 s19, s3 ; GFX9-NEXT: s_cselect_b32 s20, s21, s20 ; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GFX9-NEXT: s_subb_u32 s17, s17, s3 -; GFX9-NEXT: s_sub_u32 s21, s18, s2 -; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GFX9-NEXT: s_subb_u32 s14, s17, 0 +; GFX9-NEXT: s_subb_u32 s14, s17, s3 +; GFX9-NEXT: s_sub_u32 s15, s18, s2 +; GFX9-NEXT: s_subb_u32 s14, s14, 0 ; GFX9-NEXT: s_cmp_lg_u32 s20, 0 -; GFX9-NEXT: s_cselect_b32 s15, s21, s18 +; GFX9-NEXT: s_cselect_b32 s15, s15, s18 ; GFX9-NEXT: s_cselect_b32 s14, s14, s19 ; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 ; GFX9-NEXT: s_subb_u32 s9, s9, s16 @@ -9921,8 +9847,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_sub_u32 s6, 0, s2 -; GFX9-NEXT: s_subb_u32 s7, 0, s3 +; GFX9-NEXT: s_sub_u32 s4, 0, s2 +; GFX9-NEXT: s_subb_u32 s5, 0, s3 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -9932,74 +9858,70 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v1 ; GFX9-NEXT: v_readfirstlane_b32 s9, v2 -; GFX9-NEXT: s_mul_hi_u32 s8, s6, s4 -; GFX9-NEXT: s_mul_i32 s14, s6, s9 -; GFX9-NEXT: s_mul_i32 s5, s7, s4 +; GFX9-NEXT: s_mul_hi_u32 s8, s4, s6 +; GFX9-NEXT: s_mul_i32 s14, s4, s9 +; GFX9-NEXT: s_mul_i32 s7, s5, s6 ; GFX9-NEXT: s_add_i32 s8, s8, s14 -; GFX9-NEXT: s_add_i32 s8, s8, s5 -; GFX9-NEXT: s_mul_i32 s15, s6, s4 -; GFX9-NEXT: s_mul_i32 s14, s4, s8 -; GFX9-NEXT: s_mul_hi_u32 s16, s4, s15 -; GFX9-NEXT: s_mul_hi_u32 s5, s4, s8 +; GFX9-NEXT: s_add_i32 s8, s8, s7 +; GFX9-NEXT: s_mul_i32 s15, s4, s6 +; GFX9-NEXT: s_mul_i32 s14, s6, s8 +; GFX9-NEXT: s_mul_hi_u32 s16, s6, s15 +; GFX9-NEXT: s_mul_hi_u32 s7, s6, s8 ; GFX9-NEXT: s_add_u32 s14, s16, s14 -; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_addc_u32 s7, 0, s7 ; GFX9-NEXT: s_mul_hi_u32 s17, s9, s15 ; GFX9-NEXT: s_mul_i32 s15, s9, s15 ; GFX9-NEXT: s_add_u32 s14, s14, s15 ; GFX9-NEXT: s_mul_hi_u32 s16, s9, s8 -; GFX9-NEXT: s_addc_u32 s5, s5, s17 +; GFX9-NEXT: s_addc_u32 s7, s7, s17 ; GFX9-NEXT: s_addc_u32 s14, s16, 0 ; GFX9-NEXT: s_mul_i32 s8, s9, s8 -; GFX9-NEXT: s_add_u32 s5, s5, s8 +; GFX9-NEXT: s_add_u32 s7, s7, s8 ; GFX9-NEXT: s_addc_u32 s8, 0, s14 -; GFX9-NEXT: s_add_u32 s14, s4, s5 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s8, s9, s8 -; GFX9-NEXT: s_mul_i32 s4, s6, s8 -; GFX9-NEXT: s_mul_hi_u32 s5, s6, s14 -; GFX9-NEXT: s_add_i32 s4, s5, s4 -; GFX9-NEXT: s_mul_i32 s7, s7, s14 -; GFX9-NEXT: s_add_i32 s4, s4, s7 -; GFX9-NEXT: s_mul_i32 s6, s6, s14 -; GFX9-NEXT: s_mul_hi_u32 s7, s8, s6 -; GFX9-NEXT: s_mul_i32 s9, s8, s6 -; GFX9-NEXT: s_mul_i32 s16, s14, s4 -; GFX9-NEXT: s_mul_hi_u32 s6, s14, s6 -; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4 -; GFX9-NEXT: s_add_u32 s6, s6, s16 +; GFX9-NEXT: s_add_u32 s6, s6, s7 +; GFX9-NEXT: s_addc_u32 s7, s9, s8 +; GFX9-NEXT: s_mul_i32 s8, s4, s7 +; GFX9-NEXT: s_mul_hi_u32 s9, s4, s6 +; GFX9-NEXT: s_add_i32 s8, s9, s8 +; GFX9-NEXT: s_mul_i32 s5, s5, s6 +; GFX9-NEXT: s_add_i32 s8, s8, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s6 +; GFX9-NEXT: s_mul_hi_u32 s9, s7, s4 +; GFX9-NEXT: s_mul_i32 s14, s7, s4 +; GFX9-NEXT: s_mul_i32 s16, s6, s8 +; GFX9-NEXT: s_mul_hi_u32 s4, s6, s4 +; GFX9-NEXT: s_mul_hi_u32 s15, s6, s8 +; GFX9-NEXT: s_add_u32 s4, s4, s16 ; GFX9-NEXT: s_addc_u32 s15, 0, s15 -; GFX9-NEXT: s_add_u32 s6, s6, s9 -; GFX9-NEXT: s_mul_hi_u32 s5, s8, s4 -; GFX9-NEXT: s_addc_u32 s6, s15, s7 +; GFX9-NEXT: s_add_u32 s4, s4, s14 +; GFX9-NEXT: s_mul_hi_u32 s5, s7, s8 +; GFX9-NEXT: s_addc_u32 s4, s15, s9 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_mul_i32 s4, s8, s4 -; GFX9-NEXT: s_add_u32 s4, s6, s4 -; GFX9-NEXT: s_addc_u32 s6, 0, s5 -; GFX9-NEXT: s_add_u32 s9, s14, s4 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s8, s8, s6 +; GFX9-NEXT: s_mul_i32 s8, s7, s8 +; GFX9-NEXT: s_add_u32 s4, s4, s8 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_add_u32 s8, s6, s4 +; GFX9-NEXT: s_addc_u32 s9, s7, s5 ; GFX9-NEXT: s_ashr_i32 s4, s11, 31 ; GFX9-NEXT: s_add_u32 s6, s10, s4 ; GFX9-NEXT: s_mov_b32 s5, s4 ; GFX9-NEXT: s_addc_u32 s7, s11, s4 ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] -; GFX9-NEXT: s_mul_i32 s11, s6, s8 -; GFX9-NEXT: s_mul_hi_u32 s14, s6, s9 -; GFX9-NEXT: s_mul_hi_u32 s10, s6, s8 +; GFX9-NEXT: s_mul_i32 s11, s6, s9 +; GFX9-NEXT: s_mul_hi_u32 s14, s6, s8 +; GFX9-NEXT: s_mul_hi_u32 s10, s6, s9 ; GFX9-NEXT: s_add_u32 s11, s14, s11 ; GFX9-NEXT: s_addc_u32 s10, 0, s10 -; GFX9-NEXT: s_mul_hi_u32 s15, s7, s9 -; GFX9-NEXT: s_mul_i32 s9, s7, s9 -; GFX9-NEXT: s_add_u32 s9, s11, s9 -; GFX9-NEXT: s_mul_hi_u32 s14, s7, s8 -; GFX9-NEXT: s_addc_u32 s9, s10, s15 -; GFX9-NEXT: s_addc_u32 s10, s14, 0 +; GFX9-NEXT: s_mul_hi_u32 s15, s7, s8 ; GFX9-NEXT: s_mul_i32 s8, s7, s8 -; GFX9-NEXT: s_add_u32 s8, s9, s8 +; GFX9-NEXT: s_add_u32 s8, s11, s8 +; GFX9-NEXT: s_mul_hi_u32 s14, s7, s9 +; GFX9-NEXT: s_addc_u32 s8, s10, s15 +; GFX9-NEXT: s_addc_u32 s10, s14, 0 +; GFX9-NEXT: s_mul_i32 s9, s7, s9 +; GFX9-NEXT: s_add_u32 s8, s8, s9 ; GFX9-NEXT: s_addc_u32 s9, 0, s10 ; GFX9-NEXT: s_mul_i32 s9, s2, s9 ; GFX9-NEXT: s_mul_hi_u32 s10, s2, s8 @@ -10010,11 +9932,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_mul_i32 s8, s2, s8 ; GFX9-NEXT: s_sub_u32 s6, s6, s8 ; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_subb_u32 s15, s10, s3 ; GFX9-NEXT: s_sub_u32 s16, s6, s2 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 ; GFX9-NEXT: s_subb_u32 s17, s15, 0 ; GFX9-NEXT: s_cmp_ge_u32 s17, s3 ; GFX9-NEXT: s_cselect_b32 s18, -1, 0 @@ -10023,13 +9943,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cmp_eq_u32 s17, s3 ; GFX9-NEXT: s_cselect_b32 s18, s19, s18 ; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s15, s15, s3 -; GFX9-NEXT: s_sub_u32 s19, s16, s2 -; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s10, s15, 0 +; GFX9-NEXT: s_subb_u32 s10, s15, s3 +; GFX9-NEXT: s_sub_u32 s11, s16, s2 +; GFX9-NEXT: s_subb_u32 s10, s10, 0 ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cselect_b32 s11, s19, s16 +; GFX9-NEXT: s_cselect_b32 s11, s11, s16 ; GFX9-NEXT: s_cselect_b32 s10, s10, s17 ; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_subb_u32 s7, s7, s14 |