diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/carryout-selection.ll')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/carryout-selection.ll | 614 |
1 files changed, 345 insertions, 269 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index b96de17..4a6fa4f 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -704,6 +704,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; CISI-NEXT: s_add_u32 s4, s4, s6 ; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0 ; CISI-NEXT: s_or_b32 s6, s12, s13 +; CISI-NEXT: s_cmp_lg_u32 s6, 0 ; CISI-NEXT: s_addc_u32 s5, s5, s7 ; CISI-NEXT: s_mov_b32 s8, s0 ; CISI-NEXT: s_mov_b32 s9, s1 @@ -724,14 +725,16 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_add_u32 s2, s4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_add_u32 s0, s4, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_addc_u32 s1, s5, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 +; VI-NEXT: s_addc_u32 s0, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v5, s0 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -743,10 +746,12 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s12, s14 -; GFX9-NEXT: s_addc_u32 s1, s13, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_add_u32 s2, s12, s14 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_addc_u32 s0, s13, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -759,8 +764,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_add_u32 s0, s12, s14 -; GFX1010-NEXT: s_addc_u32 s1, s13, s15 +; GFX1010-NEXT: s_cselect_b32 s1, -1, 0 ; GFX1010-NEXT: v_mov_b32_e32 v0, s0 +; GFX1010-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1010-NEXT: s_addc_u32 s1, s13, s15 ; GFX1010-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -774,8 +781,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_add_u32 s4, s4, s6 -; GFX1030W32-NEXT: s_addc_u32 s5, s5, s7 +; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4 +; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0 +; GFX1030W32-NEXT: s_addc_u32 s5, s5, s7 ; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 @@ -789,8 +798,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_add_u32 s4, s4, s6 -; GFX1030W64-NEXT: s_addc_u32 s5, s5, s7 +; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX1030W64-NEXT: s_addc_u32 s5, s5, s7 ; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5 ; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] @@ -803,8 +814,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s4, s4, s6 -; GFX11-NEXT: s_addc_u32 s5, s5, s7 +; GFX11-NEXT: s_cselect_b32 s6, -1, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_cmp_lg_u32 s6, 0 +; GFX11-NEXT: s_addc_u32 s5, s5, s7 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 @@ -818,8 +831,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_add_co_u32 s0, s12, s14 -; GFX1250-NEXT: s_add_co_ci_u32 s1, s13, s15 +; GFX1250-NEXT: s_cselect_b32 s1, -1, 0 ; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1250-NEXT: s_add_co_ci_u32 s1, s13, s15 ; GFX1250-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -1676,6 +1691,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; CISI-NEXT: s_sub_u32 s4, s4, s6 ; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0 ; CISI-NEXT: s_or_b32 s6, s12, s13 +; CISI-NEXT: s_cmp_lg_u32 s6, 0 ; CISI-NEXT: s_subb_u32 s5, s5, s7 ; CISI-NEXT: s_mov_b32 s8, s0 ; CISI-NEXT: s_mov_b32 s9, s1 @@ -1696,14 +1712,16 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_sub_u32 s2, s4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_sub_u32 s0, s4, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_subb_u32 s1, s5, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 +; VI-NEXT: s_subb_u32 s0, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v5, s0 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -1715,10 +1733,12 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s0, s12, s14 -; GFX9-NEXT: s_subb_u32 s1, s13, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_sub_u32 s2, s12, s14 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_subb_u32 s0, s13, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -1731,8 +1751,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_sub_u32 s0, s12, s14 -; GFX1010-NEXT: s_subb_u32 s1, s13, s15 +; GFX1010-NEXT: s_cselect_b32 s1, -1, 0 ; GFX1010-NEXT: v_mov_b32_e32 v0, s0 +; GFX1010-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1010-NEXT: s_subb_u32 s1, s13, s15 ; GFX1010-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -1746,8 +1768,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_sub_u32 s4, s4, s6 -; GFX1030W32-NEXT: s_subb_u32 s5, s5, s7 +; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4 +; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0 +; GFX1030W32-NEXT: s_subb_u32 s5, s5, s7 ; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 @@ -1761,8 +1785,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_sub_u32 s4, s4, s6 -; GFX1030W64-NEXT: s_subb_u32 s5, s5, s7 +; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX1030W64-NEXT: s_subb_u32 s5, s5, s7 ; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5 ; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] @@ -1775,8 +1801,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s4, s4, s6 -; GFX11-NEXT: s_subb_u32 s5, s5, s7 +; GFX11-NEXT: s_cselect_b32 s6, -1, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_cmp_lg_u32 s6, 0 +; GFX11-NEXT: s_subb_u32 s5, s5, s7 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 @@ -1790,8 +1818,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_sub_co_u32 s0, s12, s14 -; GFX1250-NEXT: s_sub_co_ci_u32 s1, s13, s15 +; GFX1250-NEXT: s_cselect_b32 s1, -1, 0 ; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1250-NEXT: s_sub_co_ci_u32 s1, s13, s15 ; GFX1250-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -2188,46 +2218,49 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; VI-NEXT: s_addc_u32 s6, s7, s9 ; VI-NEXT: s_addc_u32 s8, s8, 0 ; VI-NEXT: v_readfirstlane_b32 s7, v0 -; VI-NEXT: s_add_u32 s10, s6, s7 -; VI-NEXT: v_mov_b32_e32 v0, s10 +; VI-NEXT: s_add_u32 s12, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s4, v0, 0 -; VI-NEXT: s_addc_u32 s11, 0, s8 -; VI-NEXT: s_mul_i32 s8, s4, s11 +; VI-NEXT: s_addc_u32 s13, 0, s8 +; VI-NEXT: s_mul_i32 s8, s4, s13 ; VI-NEXT: v_readfirstlane_b32 s9, v1 ; VI-NEXT: s_add_i32 s8, s9, s8 -; VI-NEXT: s_mul_i32 s9, s5, s10 -; VI-NEXT: s_add_i32 s12, s8, s9 -; VI-NEXT: s_sub_i32 s13, s3, s12 +; VI-NEXT: s_mul_i32 s9, s5, s12 +; VI-NEXT: s_add_i32 s14, s8, s9 +; VI-NEXT: s_sub_i32 s10, s3, s14 ; VI-NEXT: v_readfirstlane_b32 s8, v0 -; VI-NEXT: s_sub_u32 s14, s2, s8 +; VI-NEXT: s_sub_u32 s15, s2, s8 ; VI-NEXT: s_cselect_b64 s[8:9], -1, 0 -; VI-NEXT: s_subb_u32 s13, s13, s5 -; VI-NEXT: s_sub_u32 s15, s14, s4 -; VI-NEXT: s_subb_u32 s13, s13, 0 -; VI-NEXT: s_cmp_ge_u32 s13, s5 +; VI-NEXT: s_cmp_lg_u64 s[8:9], 0 +; VI-NEXT: s_subb_u32 s16, s10, s5 +; VI-NEXT: s_sub_u32 s17, s15, s4 +; VI-NEXT: s_cselect_b64 s[10:11], -1, 0 +; VI-NEXT: s_cmp_lg_u64 s[10:11], 0 +; VI-NEXT: s_subb_u32 s10, s16, 0 +; VI-NEXT: s_cmp_ge_u32 s10, s5 +; VI-NEXT: s_cselect_b32 s11, -1, 0 +; VI-NEXT: s_cmp_ge_u32 s17, s4 ; VI-NEXT: s_cselect_b32 s16, -1, 0 -; VI-NEXT: s_cmp_ge_u32 s15, s4 -; VI-NEXT: s_cselect_b32 s15, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s13, s5 -; VI-NEXT: s_cselect_b32 s13, s15, s16 -; VI-NEXT: s_add_u32 s15, s10, 1 -; VI-NEXT: s_addc_u32 s16, s11, 0 -; VI-NEXT: s_add_u32 s17, s10, 2 -; VI-NEXT: s_addc_u32 s18, s11, 0 -; VI-NEXT: s_cmp_lg_u32 s13, 0 -; VI-NEXT: s_cselect_b32 s13, s17, s15 -; VI-NEXT: s_cselect_b32 s15, s18, s16 +; VI-NEXT: s_cmp_eq_u32 s10, s5 +; VI-NEXT: s_cselect_b32 s10, s16, s11 +; VI-NEXT: s_add_u32 s11, s12, 1 +; VI-NEXT: s_addc_u32 s16, s13, 0 +; VI-NEXT: s_add_u32 s17, s12, 2 +; VI-NEXT: s_addc_u32 s18, s13, 0 +; VI-NEXT: s_cmp_lg_u32 s10, 0 +; VI-NEXT: s_cselect_b32 s10, s17, s11 +; VI-NEXT: s_cselect_b32 s11, s18, s16 ; VI-NEXT: s_cmp_lg_u64 s[8:9], 0 -; VI-NEXT: s_subb_u32 s3, s3, s12 +; VI-NEXT: s_subb_u32 s3, s3, s14 ; VI-NEXT: s_cmp_ge_u32 s3, s5 ; VI-NEXT: s_cselect_b32 s8, -1, 0 -; VI-NEXT: s_cmp_ge_u32 s14, s4 +; VI-NEXT: s_cmp_ge_u32 s15, s4 ; VI-NEXT: s_cselect_b32 s9, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s3, s5 ; VI-NEXT: s_cselect_b32 s3, s9, s8 ; VI-NEXT: s_cmp_lg_u32 s3, 0 -; VI-NEXT: s_cselect_b32 s9, s15, s11 -; VI-NEXT: s_cselect_b32 s8, s13, s10 +; VI-NEXT: s_cselect_b32 s9, s11, s13 +; VI-NEXT: s_cselect_b32 s8, s10, s12 ; VI-NEXT: s_cbranch_execnz .LBB16_4 ; VI-NEXT: .LBB16_2: ; VI-NEXT: v_cvt_f32_u32_e32 v0, s4 @@ -2278,8 +2311,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_u32 s8, 0, s6 -; GFX9-NEXT: s_subb_u32 s9, 0, s7 +; GFX9-NEXT: s_sub_u32 s10, 0, s6 +; GFX9-NEXT: s_subb_u32 s11, 0, s7 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2288,102 +2321,109 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX9-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s10, v1 -; GFX9-NEXT: v_readfirstlane_b32 s11, v0 -; GFX9-NEXT: s_mul_i32 s12, s8, s10 -; GFX9-NEXT: s_mul_hi_u32 s14, s8, s11 -; GFX9-NEXT: s_mul_i32 s13, s9, s11 -; GFX9-NEXT: s_add_i32 s12, s14, s12 -; GFX9-NEXT: s_add_i32 s12, s12, s13 -; GFX9-NEXT: s_mul_i32 s15, s8, s11 -; GFX9-NEXT: s_mul_i32 s14, s11, s12 -; GFX9-NEXT: s_mul_hi_u32 s16, s11, s15 -; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12 +; GFX9-NEXT: v_readfirstlane_b32 s12, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v0 +; GFX9-NEXT: s_mul_i32 s9, s10, s12 +; GFX9-NEXT: s_mul_hi_u32 s14, s10, s8 +; GFX9-NEXT: s_mul_i32 s13, s11, s8 +; GFX9-NEXT: s_add_i32 s9, s14, s9 +; GFX9-NEXT: s_add_i32 s9, s9, s13 +; GFX9-NEXT: s_mul_i32 s15, s10, s8 +; GFX9-NEXT: s_mul_i32 s14, s8, s9 +; GFX9-NEXT: s_mul_hi_u32 s16, s8, s15 +; GFX9-NEXT: s_mul_hi_u32 s13, s8, s9 ; GFX9-NEXT: s_add_u32 s14, s16, s14 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_mul_hi_u32 s17, s10, s15 -; GFX9-NEXT: s_mul_i32 s15, s10, s15 +; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15 +; GFX9-NEXT: s_mul_i32 s15, s12, s15 ; GFX9-NEXT: s_add_u32 s14, s14, s15 -; GFX9-NEXT: s_mul_hi_u32 s16, s10, s12 +; GFX9-NEXT: s_mul_hi_u32 s16, s12, s9 ; GFX9-NEXT: s_addc_u32 s13, s13, s17 ; GFX9-NEXT: s_addc_u32 s14, s16, 0 -; GFX9-NEXT: s_mul_i32 s12, s10, s12 -; GFX9-NEXT: s_add_u32 s12, s13, s12 +; GFX9-NEXT: s_mul_i32 s9, s12, s9 +; GFX9-NEXT: s_add_u32 s9, s13, s9 ; GFX9-NEXT: s_addc_u32 s13, 0, s14 -; GFX9-NEXT: s_add_u32 s11, s11, s12 -; GFX9-NEXT: s_addc_u32 s10, s10, s13 -; GFX9-NEXT: s_mul_i32 s12, s8, s10 -; GFX9-NEXT: s_mul_hi_u32 s13, s8, s11 -; GFX9-NEXT: s_add_i32 s12, s13, s12 -; GFX9-NEXT: s_mul_i32 s9, s9, s11 -; GFX9-NEXT: s_add_i32 s12, s12, s9 -; GFX9-NEXT: s_mul_i32 s8, s8, s11 -; GFX9-NEXT: s_mul_hi_u32 s13, s10, s8 -; GFX9-NEXT: s_mul_i32 s14, s10, s8 -; GFX9-NEXT: s_mul_i32 s16, s11, s12 -; GFX9-NEXT: s_mul_hi_u32 s8, s11, s8 -; GFX9-NEXT: s_mul_hi_u32 s15, s11, s12 -; GFX9-NEXT: s_add_u32 s8, s8, s16 +; GFX9-NEXT: s_add_u32 s14, s8, s9 +; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX9-NEXT: s_addc_u32 s12, s12, s13 +; GFX9-NEXT: s_mul_i32 s8, s10, s12 +; GFX9-NEXT: s_mul_hi_u32 s9, s10, s14 +; GFX9-NEXT: s_add_i32 s8, s9, s8 +; GFX9-NEXT: s_mul_i32 s11, s11, s14 +; GFX9-NEXT: s_add_i32 s8, s8, s11 +; GFX9-NEXT: s_mul_i32 s10, s10, s14 +; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10 +; GFX9-NEXT: s_mul_i32 s13, s12, s10 +; GFX9-NEXT: s_mul_i32 s16, s14, s8 +; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10 +; GFX9-NEXT: s_mul_hi_u32 s15, s14, s8 +; GFX9-NEXT: s_add_u32 s10, s10, s16 ; GFX9-NEXT: s_addc_u32 s15, 0, s15 -; GFX9-NEXT: s_add_u32 s8, s8, s14 -; GFX9-NEXT: s_mul_hi_u32 s9, s10, s12 -; GFX9-NEXT: s_addc_u32 s8, s15, s13 +; GFX9-NEXT: s_add_u32 s10, s10, s13 +; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8 +; GFX9-NEXT: s_addc_u32 s10, s15, s11 ; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: s_mul_i32 s12, s10, s12 -; GFX9-NEXT: s_add_u32 s8, s8, s12 +; GFX9-NEXT: s_mul_i32 s8, s12, s8 +; GFX9-NEXT: s_add_u32 s8, s10, s8 +; GFX9-NEXT: s_addc_u32 s10, 0, s9 +; GFX9-NEXT: s_add_u32 s11, s14, s8 +; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX9-NEXT: s_addc_u32 s8, s12, s10 +; GFX9-NEXT: s_mul_i32 s10, s2, s8 +; GFX9-NEXT: s_mul_hi_u32 s12, s2, s11 +; GFX9-NEXT: s_mul_hi_u32 s9, s2, s8 +; GFX9-NEXT: s_add_u32 s10, s12, s10 ; GFX9-NEXT: s_addc_u32 s9, 0, s9 -; GFX9-NEXT: s_add_u32 s8, s11, s8 -; GFX9-NEXT: s_addc_u32 s9, s10, s9 -; GFX9-NEXT: s_mul_i32 s11, s2, s9 -; GFX9-NEXT: s_mul_hi_u32 s12, s2, s8 -; GFX9-NEXT: s_mul_hi_u32 s10, s2, s9 -; GFX9-NEXT: s_add_u32 s11, s12, s11 -; GFX9-NEXT: s_addc_u32 s10, 0, s10 -; GFX9-NEXT: s_mul_hi_u32 s13, s3, s8 -; GFX9-NEXT: s_mul_i32 s8, s3, s8 -; GFX9-NEXT: s_add_u32 s8, s11, s8 -; GFX9-NEXT: s_mul_hi_u32 s12, s3, s9 -; GFX9-NEXT: s_addc_u32 s8, s10, s13 +; GFX9-NEXT: s_mul_hi_u32 s13, s3, s11 +; GFX9-NEXT: s_mul_i32 s11, s3, s11 +; GFX9-NEXT: s_add_u32 s10, s10, s11 +; GFX9-NEXT: s_mul_hi_u32 s12, s3, s8 +; GFX9-NEXT: s_addc_u32 s9, s9, s13 ; GFX9-NEXT: s_addc_u32 s10, s12, 0 -; GFX9-NEXT: s_mul_i32 s9, s3, s9 -; GFX9-NEXT: s_add_u32 s11, s8, s9 -; GFX9-NEXT: s_addc_u32 s10, 0, s10 -; GFX9-NEXT: s_mul_i32 s8, s6, s10 -; GFX9-NEXT: s_mul_hi_u32 s9, s6, s11 +; GFX9-NEXT: s_mul_i32 s8, s3, s8 +; GFX9-NEXT: s_add_u32 s12, s9, s8 +; GFX9-NEXT: s_addc_u32 s13, 0, s10 +; GFX9-NEXT: s_mul_i32 s8, s6, s13 +; GFX9-NEXT: s_mul_hi_u32 s9, s6, s12 ; GFX9-NEXT: s_add_i32 s8, s9, s8 -; GFX9-NEXT: s_mul_i32 s9, s7, s11 -; GFX9-NEXT: s_add_i32 s12, s8, s9 -; GFX9-NEXT: s_sub_i32 s13, s3, s12 -; GFX9-NEXT: s_mul_i32 s8, s6, s11 -; GFX9-NEXT: s_sub_u32 s14, s2, s8 +; GFX9-NEXT: s_mul_i32 s9, s7, s12 +; GFX9-NEXT: s_add_i32 s14, s8, s9 +; GFX9-NEXT: s_sub_i32 s10, s3, s14 +; GFX9-NEXT: s_mul_i32 s8, s6, s12 +; GFX9-NEXT: s_sub_u32 s15, s2, s8 ; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_subb_u32 s13, s13, s7 -; GFX9-NEXT: s_sub_u32 s15, s14, s6 -; GFX9-NEXT: s_subb_u32 s13, s13, 0 -; GFX9-NEXT: s_cmp_ge_u32 s13, s7 +; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX9-NEXT: s_subb_u32 s16, s10, s7 +; GFX9-NEXT: s_sub_u32 s17, s15, s6 +; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GFX9-NEXT: s_subb_u32 s10, s16, 0 +; GFX9-NEXT: s_cmp_ge_u32 s10, s7 +; GFX9-NEXT: s_cselect_b32 s11, -1, 0 +; GFX9-NEXT: s_cmp_ge_u32 s17, s6 ; GFX9-NEXT: s_cselect_b32 s16, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s15, s6 -; GFX9-NEXT: s_cselect_b32 s15, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s13, s7 -; GFX9-NEXT: s_cselect_b32 s13, s15, s16 -; GFX9-NEXT: s_add_u32 s15, s11, 1 -; GFX9-NEXT: s_addc_u32 s16, s10, 0 -; GFX9-NEXT: s_add_u32 s17, s11, 2 -; GFX9-NEXT: s_addc_u32 s18, s10, 0 -; GFX9-NEXT: s_cmp_lg_u32 s13, 0 -; GFX9-NEXT: s_cselect_b32 s13, s17, s15 -; GFX9-NEXT: s_cselect_b32 s15, s18, s16 +; GFX9-NEXT: s_cmp_eq_u32 s10, s7 +; GFX9-NEXT: s_cselect_b32 s10, s16, s11 +; GFX9-NEXT: s_add_u32 s11, s12, 1 +; GFX9-NEXT: s_addc_u32 s16, s13, 0 +; GFX9-NEXT: s_add_u32 s17, s12, 2 +; GFX9-NEXT: s_addc_u32 s18, s13, 0 +; GFX9-NEXT: s_cmp_lg_u32 s10, 0 +; GFX9-NEXT: s_cselect_b32 s10, s17, s11 +; GFX9-NEXT: s_cselect_b32 s11, s18, s16 ; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GFX9-NEXT: s_subb_u32 s3, s3, s12 +; GFX9-NEXT: s_subb_u32 s3, s3, s14 ; GFX9-NEXT: s_cmp_ge_u32 s3, s7 ; GFX9-NEXT: s_cselect_b32 s8, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s14, s6 +; GFX9-NEXT: s_cmp_ge_u32 s15, s6 ; GFX9-NEXT: s_cselect_b32 s9, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s3, s7 ; GFX9-NEXT: s_cselect_b32 s3, s9, s8 ; GFX9-NEXT: s_cmp_lg_u32 s3, 0 -; GFX9-NEXT: s_cselect_b32 s9, s15, s10 -; GFX9-NEXT: s_cselect_b32 s8, s13, s11 +; GFX9-NEXT: s_cselect_b32 s9, s11, s13 +; GFX9-NEXT: s_cselect_b32 s8, s10, s12 ; GFX9-NEXT: s_cbranch_execnz .LBB16_3 ; GFX9-NEXT: .LBB16_2: ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 @@ -2463,40 +2503,44 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_add_u32 s11, s12, s11 ; GFX1010-NEXT: s_addc_u32 s12, 0, s13 ; GFX1010-NEXT: s_add_u32 s8, s8, s11 +; GFX1010-NEXT: s_cselect_b32 s11, -1, 0 +; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s8 +; GFX1010-NEXT: s_cmp_lg_u32 s11, 0 +; GFX1010-NEXT: s_mul_i32 s11, s9, s8 ; GFX1010-NEXT: s_addc_u32 s5, s5, s12 -; GFX1010-NEXT: s_mul_hi_u32 s11, s9, s8 -; GFX1010-NEXT: s_mul_i32 s12, s9, s8 -; GFX1010-NEXT: s_mul_i32 s9, s9, s5 ; GFX1010-NEXT: s_mul_i32 s10, s10, s8 -; GFX1010-NEXT: s_add_i32 s9, s11, s9 -; GFX1010-NEXT: s_mul_i32 s11, s5, s12 +; GFX1010-NEXT: s_mul_i32 s9, s9, s5 +; GFX1010-NEXT: s_mul_hi_u32 s12, s8, s11 +; GFX1010-NEXT: s_add_i32 s9, s13, s9 +; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s11 ; GFX1010-NEXT: s_add_i32 s9, s9, s10 -; GFX1010-NEXT: s_mul_hi_u32 s10, s8, s12 +; GFX1010-NEXT: s_mul_i32 s10, s5, s11 ; GFX1010-NEXT: s_mul_i32 s15, s8, s9 ; GFX1010-NEXT: s_mul_hi_u32 s14, s8, s9 -; GFX1010-NEXT: s_add_u32 s10, s10, s15 -; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s12 +; GFX1010-NEXT: s_add_u32 s12, s12, s15 ; GFX1010-NEXT: s_addc_u32 s14, 0, s14 -; GFX1010-NEXT: s_mul_hi_u32 s12, s5, s9 -; GFX1010-NEXT: s_add_u32 s10, s10, s11 +; GFX1010-NEXT: s_mul_hi_u32 s11, s5, s9 +; GFX1010-NEXT: s_add_u32 s10, s12, s10 ; GFX1010-NEXT: s_mul_i32 s9, s5, s9 ; GFX1010-NEXT: s_addc_u32 s10, s14, s13 -; GFX1010-NEXT: s_addc_u32 s11, s12, 0 +; GFX1010-NEXT: s_addc_u32 s11, s11, 0 ; GFX1010-NEXT: s_add_u32 s9, s10, s9 ; GFX1010-NEXT: s_addc_u32 s10, 0, s11 ; GFX1010-NEXT: s_add_u32 s8, s8, s9 +; GFX1010-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s8 +; GFX1010-NEXT: s_cmp_lg_u32 s9, 0 +; GFX1010-NEXT: s_mul_hi_u32 s9, s3, s8 ; GFX1010-NEXT: s_addc_u32 s5, s5, s10 -; GFX1010-NEXT: s_mul_hi_u32 s9, s2, s8 -; GFX1010-NEXT: s_mul_i32 s12, s2, s5 -; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s5 -; GFX1010-NEXT: s_mul_hi_u32 s10, s3, s8 ; GFX1010-NEXT: s_mul_i32 s8, s3, s8 -; GFX1010-NEXT: s_add_u32 s9, s9, s12 -; GFX1010-NEXT: s_addc_u32 s11, 0, s11 +; GFX1010-NEXT: s_mul_i32 s12, s2, s5 +; GFX1010-NEXT: s_mul_hi_u32 s10, s2, s5 +; GFX1010-NEXT: s_add_u32 s11, s11, s12 +; GFX1010-NEXT: s_addc_u32 s10, 0, s10 ; GFX1010-NEXT: s_mul_hi_u32 s13, s3, s5 -; GFX1010-NEXT: s_add_u32 s8, s9, s8 +; GFX1010-NEXT: s_add_u32 s8, s11, s8 ; GFX1010-NEXT: s_mul_i32 s5, s3, s5 -; GFX1010-NEXT: s_addc_u32 s8, s11, s10 +; GFX1010-NEXT: s_addc_u32 s8, s10, s9 ; GFX1010-NEXT: s_addc_u32 s9, s13, 0 ; GFX1010-NEXT: s_add_u32 s5, s8, s5 ; GFX1010-NEXT: s_addc_u32 s8, 0, s9 @@ -2509,8 +2553,11 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_sub_i32 s11, s3, s9 ; GFX1010-NEXT: s_sub_u32 s10, s2, s10 ; GFX1010-NEXT: s_cselect_b32 s12, -1, 0 +; GFX1010-NEXT: s_cmp_lg_u32 s12, 0 ; GFX1010-NEXT: s_subb_u32 s11, s11, s7 ; GFX1010-NEXT: s_sub_u32 s13, s10, s6 +; GFX1010-NEXT: s_cselect_b32 s14, -1, 0 +; GFX1010-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1010-NEXT: s_subb_u32 s11, s11, 0 ; GFX1010-NEXT: s_cmp_ge_u32 s11, s7 ; GFX1010-NEXT: s_cselect_b32 s14, -1, 0 @@ -2616,40 +2663,44 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: s_add_u32 s11, s12, s11 ; GFX1030W32-NEXT: s_addc_u32 s12, 0, s13 ; GFX1030W32-NEXT: s_add_u32 s8, s8, s11 +; GFX1030W32-NEXT: s_cselect_b32 s11, -1, 0 +; GFX1030W32-NEXT: s_mul_hi_u32 s13, s9, s8 +; GFX1030W32-NEXT: s_cmp_lg_u32 s11, 0 +; GFX1030W32-NEXT: s_mul_i32 s11, s9, s8 ; GFX1030W32-NEXT: s_addc_u32 s7, s7, s12 -; GFX1030W32-NEXT: s_mul_hi_u32 s11, s9, s8 -; GFX1030W32-NEXT: s_mul_i32 s12, s9, s8 -; GFX1030W32-NEXT: s_mul_i32 s9, s9, s7 ; GFX1030W32-NEXT: s_mul_i32 s10, s10, s8 -; GFX1030W32-NEXT: s_add_i32 s9, s11, s9 -; GFX1030W32-NEXT: s_mul_i32 s11, s7, s12 +; GFX1030W32-NEXT: s_mul_i32 s9, s9, s7 +; GFX1030W32-NEXT: s_mul_hi_u32 s12, s8, s11 +; GFX1030W32-NEXT: s_add_i32 s9, s13, s9 +; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s11 ; GFX1030W32-NEXT: s_add_i32 s9, s9, s10 -; GFX1030W32-NEXT: s_mul_hi_u32 s10, s8, s12 +; GFX1030W32-NEXT: s_mul_i32 s10, s7, s11 ; GFX1030W32-NEXT: s_mul_i32 s15, s8, s9 ; GFX1030W32-NEXT: s_mul_hi_u32 s14, s8, s9 -; GFX1030W32-NEXT: s_add_u32 s10, s10, s15 -; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s12 +; GFX1030W32-NEXT: s_add_u32 s12, s12, s15 ; GFX1030W32-NEXT: s_addc_u32 s14, 0, s14 -; GFX1030W32-NEXT: s_mul_hi_u32 s12, s7, s9 -; GFX1030W32-NEXT: s_add_u32 s10, s10, s11 +; GFX1030W32-NEXT: s_mul_hi_u32 s11, s7, s9 +; GFX1030W32-NEXT: s_add_u32 s10, s12, s10 ; GFX1030W32-NEXT: s_mul_i32 s9, s7, s9 ; GFX1030W32-NEXT: s_addc_u32 s10, s14, s13 -; GFX1030W32-NEXT: s_addc_u32 s11, s12, 0 +; GFX1030W32-NEXT: s_addc_u32 s11, s11, 0 ; GFX1030W32-NEXT: s_add_u32 s9, s10, s9 ; GFX1030W32-NEXT: s_addc_u32 s10, 0, s11 ; GFX1030W32-NEXT: s_add_u32 s8, s8, s9 +; GFX1030W32-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s8 +; GFX1030W32-NEXT: s_cmp_lg_u32 s9, 0 +; GFX1030W32-NEXT: s_mul_hi_u32 s9, s3, s8 ; GFX1030W32-NEXT: s_addc_u32 s7, s7, s10 -; GFX1030W32-NEXT: s_mul_hi_u32 s9, s2, s8 -; GFX1030W32-NEXT: s_mul_i32 s12, s2, s7 -; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s7 -; GFX1030W32-NEXT: s_mul_hi_u32 s10, s3, s8 ; GFX1030W32-NEXT: s_mul_i32 s8, s3, s8 -; GFX1030W32-NEXT: s_add_u32 s9, s9, s12 -; GFX1030W32-NEXT: s_addc_u32 s11, 0, s11 +; GFX1030W32-NEXT: s_mul_i32 s12, s2, s7 +; GFX1030W32-NEXT: s_mul_hi_u32 s10, s2, s7 +; GFX1030W32-NEXT: s_add_u32 s11, s11, s12 +; GFX1030W32-NEXT: s_addc_u32 s10, 0, s10 ; GFX1030W32-NEXT: s_mul_hi_u32 s13, s3, s7 -; GFX1030W32-NEXT: s_add_u32 s8, s9, s8 +; GFX1030W32-NEXT: s_add_u32 s8, s11, s8 ; GFX1030W32-NEXT: s_mul_i32 s7, s3, s7 -; GFX1030W32-NEXT: s_addc_u32 s8, s11, s10 +; GFX1030W32-NEXT: s_addc_u32 s8, s10, s9 ; GFX1030W32-NEXT: s_addc_u32 s9, s13, 0 ; GFX1030W32-NEXT: s_add_u32 s7, s8, s7 ; GFX1030W32-NEXT: s_addc_u32 s8, 0, s9 @@ -2662,8 +2713,11 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: s_sub_i32 s11, s3, s9 ; GFX1030W32-NEXT: s_sub_u32 s10, s2, s10 ; GFX1030W32-NEXT: s_cselect_b32 s12, -1, 0 +; GFX1030W32-NEXT: s_cmp_lg_u32 s12, 0 ; GFX1030W32-NEXT: s_subb_u32 s11, s11, s5 ; GFX1030W32-NEXT: s_sub_u32 s13, s10, s4 +; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0 +; GFX1030W32-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1030W32-NEXT: s_subb_u32 s11, s11, 0 ; GFX1030W32-NEXT: s_cmp_ge_u32 s11, s5 ; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0 @@ -2736,8 +2790,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W64-NEXT: ; %bb.1: ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GFX1030W64-NEXT: s_sub_u32 s8, 0, s4 -; GFX1030W64-NEXT: s_subb_u32 s9, 0, s5 +; GFX1030W64-NEXT: s_sub_u32 s9, 0, s4 +; GFX1030W64-NEXT: s_subb_u32 s10, 0, s5 ; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0 ; GFX1030W64-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1030W64-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2746,102 +2800,109 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0 ; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v1 -; GFX1030W64-NEXT: v_readfirstlane_b32 s7, v0 -; GFX1030W64-NEXT: s_mul_i32 s10, s8, s6 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s8, s7 -; GFX1030W64-NEXT: s_mul_i32 s11, s9, s7 -; GFX1030W64-NEXT: s_add_i32 s10, s12, s10 -; GFX1030W64-NEXT: s_mul_i32 s13, s8, s7 -; GFX1030W64-NEXT: s_add_i32 s10, s10, s11 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s7, s13 -; GFX1030W64-NEXT: s_mul_i32 s15, s7, s10 -; GFX1030W64-NEXT: s_mul_hi_u32 s14, s6, s13 -; GFX1030W64-NEXT: s_mul_i32 s11, s6, s13 -; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s10 +; GFX1030W64-NEXT: v_readfirstlane_b32 s8, v1 +; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1030W64-NEXT: s_mul_i32 s7, s9, s8 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s9, s6 +; GFX1030W64-NEXT: s_mul_i32 s11, s10, s6 +; GFX1030W64-NEXT: s_add_i32 s7, s12, s7 +; GFX1030W64-NEXT: s_mul_i32 s13, s9, s6 +; GFX1030W64-NEXT: s_add_i32 s7, s7, s11 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s13 +; GFX1030W64-NEXT: s_mul_i32 s15, s6, s7 +; GFX1030W64-NEXT: s_mul_hi_u32 s14, s8, s13 +; GFX1030W64-NEXT: s_mul_i32 s11, s8, s13 +; GFX1030W64-NEXT: s_mul_hi_u32 s13, s6, s7 ; GFX1030W64-NEXT: s_add_u32 s12, s12, s15 ; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13 -; GFX1030W64-NEXT: s_mul_hi_u32 s16, s6, s10 +; GFX1030W64-NEXT: s_mul_hi_u32 s16, s8, s7 ; GFX1030W64-NEXT: s_add_u32 s11, s12, s11 -; GFX1030W64-NEXT: s_mul_i32 s10, s6, s10 +; GFX1030W64-NEXT: s_mul_i32 s7, s8, s7 ; GFX1030W64-NEXT: s_addc_u32 s11, s13, s14 ; GFX1030W64-NEXT: s_addc_u32 s12, s16, 0 -; GFX1030W64-NEXT: s_add_u32 s10, s11, s10 +; GFX1030W64-NEXT: s_add_u32 s7, s11, s7 ; GFX1030W64-NEXT: s_addc_u32 s11, 0, s12 -; GFX1030W64-NEXT: s_add_u32 s7, s7, s10 -; GFX1030W64-NEXT: s_addc_u32 s6, s6, s11 -; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s7 -; GFX1030W64-NEXT: s_mul_i32 s11, s8, s7 -; GFX1030W64-NEXT: s_mul_i32 s8, s8, s6 -; GFX1030W64-NEXT: s_mul_i32 s9, s9, s7 -; GFX1030W64-NEXT: s_add_i32 s8, s10, s8 -; GFX1030W64-NEXT: s_mul_i32 s10, s6, s11 -; GFX1030W64-NEXT: s_add_i32 s8, s8, s9 -; GFX1030W64-NEXT: s_mul_hi_u32 s9, s7, s11 -; GFX1030W64-NEXT: s_mul_i32 s14, s7, s8 -; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s8 -; GFX1030W64-NEXT: s_add_u32 s9, s9, s14 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s11 +; GFX1030W64-NEXT: s_add_u32 s12, s6, s7 +; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX1030W64-NEXT: s_mul_hi_u32 s13, s9, s12 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX1030W64-NEXT: s_mul_i32 s6, s9, s12 +; GFX1030W64-NEXT: s_addc_u32 s8, s8, s11 +; GFX1030W64-NEXT: s_mul_i32 s10, s10, s12 +; GFX1030W64-NEXT: s_mul_i32 s9, s9, s8 +; GFX1030W64-NEXT: s_mul_hi_u32 s7, s12, s6 +; GFX1030W64-NEXT: s_add_i32 s9, s13, s9 +; GFX1030W64-NEXT: s_mul_hi_u32 s11, s8, s6 +; GFX1030W64-NEXT: s_add_i32 s9, s9, s10 +; GFX1030W64-NEXT: s_mul_i32 s6, s8, s6 +; GFX1030W64-NEXT: s_mul_i32 s14, s12, s9 +; GFX1030W64-NEXT: s_mul_hi_u32 s13, s12, s9 +; GFX1030W64-NEXT: s_add_u32 s7, s7, s14 ; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13 -; GFX1030W64-NEXT: s_mul_hi_u32 s11, s6, s8 -; GFX1030W64-NEXT: s_add_u32 s9, s9, s10 -; GFX1030W64-NEXT: s_mul_i32 s8, s6, s8 -; GFX1030W64-NEXT: s_addc_u32 s9, s13, s12 -; GFX1030W64-NEXT: s_addc_u32 s10, s11, 0 -; GFX1030W64-NEXT: s_add_u32 s8, s9, s8 -; GFX1030W64-NEXT: s_addc_u32 s9, 0, s10 -; GFX1030W64-NEXT: s_add_u32 s7, s7, s8 -; GFX1030W64-NEXT: s_addc_u32 s6, s6, s9 -; GFX1030W64-NEXT: s_mul_hi_u32 s8, s2, s7 -; GFX1030W64-NEXT: s_mul_i32 s11, s2, s6 -; GFX1030W64-NEXT: s_mul_hi_u32 s10, s2, s6 -; GFX1030W64-NEXT: s_mul_hi_u32 s9, s3, s7 +; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s9 +; GFX1030W64-NEXT: s_add_u32 s6, s7, s6 +; GFX1030W64-NEXT: s_mul_i32 s9, s8, s9 +; GFX1030W64-NEXT: s_addc_u32 s6, s13, s11 +; GFX1030W64-NEXT: s_addc_u32 s7, s10, 0 +; GFX1030W64-NEXT: s_add_u32 s6, s6, s9 +; GFX1030W64-NEXT: s_addc_u32 s9, 0, s7 +; GFX1030W64-NEXT: s_add_u32 s10, s12, s6 +; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX1030W64-NEXT: s_mul_hi_u32 s11, s2, s10 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX1030W64-NEXT: s_mul_hi_u32 s6, s3, s10 +; GFX1030W64-NEXT: s_addc_u32 s7, s8, s9 +; GFX1030W64-NEXT: s_mul_i32 s8, s3, s10 +; GFX1030W64-NEXT: s_mul_i32 s10, s2, s7 +; GFX1030W64-NEXT: s_mul_hi_u32 s9, s2, s7 +; GFX1030W64-NEXT: s_add_u32 s10, s11, s10 +; GFX1030W64-NEXT: s_addc_u32 s9, 0, s9 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s7 +; GFX1030W64-NEXT: s_add_u32 s8, s10, s8 ; GFX1030W64-NEXT: s_mul_i32 s7, s3, s7 -; GFX1030W64-NEXT: s_add_u32 s8, s8, s11 -; GFX1030W64-NEXT: s_addc_u32 s10, 0, s10 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s6 -; GFX1030W64-NEXT: s_add_u32 s7, s8, s7 -; GFX1030W64-NEXT: s_mul_i32 s6, s3, s6 -; GFX1030W64-NEXT: s_addc_u32 s7, s10, s9 +; GFX1030W64-NEXT: s_addc_u32 s6, s9, s6 ; GFX1030W64-NEXT: s_addc_u32 s8, s12, 0 -; GFX1030W64-NEXT: s_add_u32 s10, s7, s6 +; GFX1030W64-NEXT: s_add_u32 s10, s6, s7 ; GFX1030W64-NEXT: s_addc_u32 s11, 0, s8 ; GFX1030W64-NEXT: s_mul_hi_u32 s6, s4, s10 ; GFX1030W64-NEXT: s_mul_i32 s7, s4, s11 ; GFX1030W64-NEXT: s_mul_i32 s8, s5, s10 ; GFX1030W64-NEXT: s_add_i32 s6, s6, s7 -; GFX1030W64-NEXT: s_add_i32 s8, s6, s8 +; GFX1030W64-NEXT: s_add_i32 s12, s6, s8 ; GFX1030W64-NEXT: s_mul_i32 s6, s4, s10 -; GFX1030W64-NEXT: s_sub_i32 s9, s3, s8 -; GFX1030W64-NEXT: s_sub_u32 s12, s2, s6 +; GFX1030W64-NEXT: s_sub_i32 s8, s3, s12 +; GFX1030W64-NEXT: s_sub_u32 s13, s2, s6 ; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX1030W64-NEXT: s_subb_u32 s9, s9, s5 -; GFX1030W64-NEXT: s_sub_u32 s13, s12, s4 -; GFX1030W64-NEXT: s_subb_u32 s9, s9, 0 -; GFX1030W64-NEXT: s_cmp_ge_u32 s9, s5 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX1030W64-NEXT: s_subb_u32 s14, s8, s5 +; GFX1030W64-NEXT: s_sub_u32 s15, s13, s4 +; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX1030W64-NEXT: s_subb_u32 s8, s14, 0 +; GFX1030W64-NEXT: s_cmp_ge_u32 s8, s5 +; GFX1030W64-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1030W64-NEXT: s_cmp_ge_u32 s15, s4 ; GFX1030W64-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4 -; GFX1030W64-NEXT: s_cselect_b32 s13, -1, 0 -; GFX1030W64-NEXT: s_cmp_eq_u32 s9, s5 -; GFX1030W64-NEXT: s_cselect_b32 s9, s13, s14 -; GFX1030W64-NEXT: s_add_u32 s13, s10, 1 +; GFX1030W64-NEXT: s_cmp_eq_u32 s8, s5 +; GFX1030W64-NEXT: s_cselect_b32 s8, s14, s9 +; GFX1030W64-NEXT: s_add_u32 s9, s10, 1 ; GFX1030W64-NEXT: s_addc_u32 s14, s11, 0 ; GFX1030W64-NEXT: s_add_u32 s15, s10, 2 ; GFX1030W64-NEXT: s_addc_u32 s16, s11, 0 -; GFX1030W64-NEXT: s_cmp_lg_u32 s9, 0 -; GFX1030W64-NEXT: s_cselect_b32 s13, s15, s13 +; GFX1030W64-NEXT: s_cmp_lg_u32 s8, 0 +; GFX1030W64-NEXT: s_cselect_b32 s15, s15, s9 ; GFX1030W64-NEXT: s_cselect_b32 s14, s16, s14 ; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1030W64-NEXT: s_subb_u32 s3, s3, s8 +; GFX1030W64-NEXT: s_subb_u32 s3, s3, s12 ; GFX1030W64-NEXT: s_cmp_ge_u32 s3, s5 ; GFX1030W64-NEXT: s_cselect_b32 s6, -1, 0 -; GFX1030W64-NEXT: s_cmp_ge_u32 s12, s4 +; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4 ; GFX1030W64-NEXT: s_cselect_b32 s7, -1, 0 ; GFX1030W64-NEXT: s_cmp_eq_u32 s3, s5 ; GFX1030W64-NEXT: s_cselect_b32 s3, s7, s6 ; GFX1030W64-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1030W64-NEXT: s_cselect_b32 s7, s14, s11 -; GFX1030W64-NEXT: s_cselect_b32 s6, s13, s10 +; GFX1030W64-NEXT: s_cselect_b32 s6, s15, s10 ; GFX1030W64-NEXT: s_cbranch_execnz .LBB16_3 ; GFX1030W64-NEXT: .LBB16_2: ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4 @@ -2927,40 +2988,44 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_add_u32 s11, s12, s11 ; GFX11-NEXT: s_addc_u32 s12, 0, s13 ; GFX11-NEXT: s_add_u32 s8, s8, s11 +; GFX11-NEXT: s_cselect_b32 s11, -1, 0 +; GFX11-NEXT: s_mul_hi_u32 s13, s9, s8 +; GFX11-NEXT: s_cmp_lg_u32 s11, 0 +; GFX11-NEXT: s_mul_i32 s11, s9, s8 ; GFX11-NEXT: s_addc_u32 s7, s7, s12 -; GFX11-NEXT: s_mul_hi_u32 s11, s9, s8 -; GFX11-NEXT: s_mul_i32 s12, s9, s8 -; GFX11-NEXT: s_mul_i32 s9, s9, s7 ; GFX11-NEXT: s_mul_i32 s10, s10, s8 -; GFX11-NEXT: s_add_i32 s9, s11, s9 -; GFX11-NEXT: s_mul_i32 s11, s7, s12 +; GFX11-NEXT: s_mul_i32 s9, s9, s7 +; GFX11-NEXT: s_mul_hi_u32 s12, s8, s11 +; GFX11-NEXT: s_add_i32 s9, s13, s9 +; GFX11-NEXT: s_mul_hi_u32 s13, s7, s11 ; GFX11-NEXT: s_add_i32 s9, s9, s10 -; GFX11-NEXT: s_mul_hi_u32 s10, s8, s12 +; GFX11-NEXT: s_mul_i32 s10, s7, s11 ; GFX11-NEXT: s_mul_i32 s15, s8, s9 ; GFX11-NEXT: s_mul_hi_u32 s14, s8, s9 -; GFX11-NEXT: s_add_u32 s10, s10, s15 -; GFX11-NEXT: s_mul_hi_u32 s13, s7, s12 +; GFX11-NEXT: s_add_u32 s12, s12, s15 ; GFX11-NEXT: s_addc_u32 s14, 0, s14 -; GFX11-NEXT: s_mul_hi_u32 s12, s7, s9 -; GFX11-NEXT: s_add_u32 s10, s10, s11 +; GFX11-NEXT: s_mul_hi_u32 s11, s7, s9 +; GFX11-NEXT: s_add_u32 s10, s12, s10 ; GFX11-NEXT: s_mul_i32 s9, s7, s9 ; GFX11-NEXT: s_addc_u32 s10, s14, s13 -; GFX11-NEXT: s_addc_u32 s11, s12, 0 +; GFX11-NEXT: s_addc_u32 s11, s11, 0 ; GFX11-NEXT: s_add_u32 s9, s10, s9 ; GFX11-NEXT: s_addc_u32 s10, 0, s11 ; GFX11-NEXT: s_add_u32 s8, s8, s9 +; GFX11-NEXT: s_cselect_b32 s9, -1, 0 +; GFX11-NEXT: s_mul_hi_u32 s11, s2, s8 +; GFX11-NEXT: s_cmp_lg_u32 s9, 0 +; GFX11-NEXT: s_mul_hi_u32 s9, s3, s8 ; GFX11-NEXT: s_addc_u32 s7, s7, s10 -; GFX11-NEXT: s_mul_hi_u32 s9, s2, s8 -; GFX11-NEXT: s_mul_i32 s12, s2, s7 -; GFX11-NEXT: s_mul_hi_u32 s11, s2, s7 -; GFX11-NEXT: s_mul_hi_u32 s10, s3, s8 ; GFX11-NEXT: s_mul_i32 s8, s3, s8 -; GFX11-NEXT: s_add_u32 s9, s9, s12 -; GFX11-NEXT: s_addc_u32 s11, 0, s11 +; GFX11-NEXT: s_mul_i32 s12, s2, s7 +; GFX11-NEXT: s_mul_hi_u32 s10, s2, s7 +; GFX11-NEXT: s_add_u32 s11, s11, s12 +; GFX11-NEXT: s_addc_u32 s10, 0, s10 ; GFX11-NEXT: s_mul_hi_u32 s13, s3, s7 -; GFX11-NEXT: s_add_u32 s8, s9, s8 +; GFX11-NEXT: s_add_u32 s8, s11, s8 ; GFX11-NEXT: s_mul_i32 s7, s3, s7 -; GFX11-NEXT: s_addc_u32 s8, s11, s10 +; GFX11-NEXT: s_addc_u32 s8, s10, s9 ; GFX11-NEXT: s_addc_u32 s9, s13, 0 ; GFX11-NEXT: s_add_u32 s7, s8, s7 ; GFX11-NEXT: s_addc_u32 s8, 0, s9 @@ -2970,14 +3035,17 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_add_i32 s9, s9, s10 ; GFX11-NEXT: s_mul_i32 s10, s4, s7 ; GFX11-NEXT: s_add_i32 s9, s9, s11 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_sub_i32 s11, s3, s9 ; GFX11-NEXT: s_sub_u32 s10, s2, s10 ; GFX11-NEXT: s_cselect_b32 s12, -1, 0 +; GFX11-NEXT: s_cmp_lg_u32 s12, 0 ; GFX11-NEXT: s_subb_u32 s11, s11, s5 ; GFX11-NEXT: s_sub_u32 s13, s10, s4 +; GFX11-NEXT: s_cselect_b32 s14, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_cmp_lg_u32 s14, 0 ; GFX11-NEXT: s_subb_u32 s11, s11, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_cmp_ge_u32 s11, s5 ; GFX11-NEXT: s_cselect_b32 s14, -1, 0 ; GFX11-NEXT: s_cmp_ge_u32 s13, s4 @@ -3050,8 +3118,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_or_b64 s[4:5], s[2:3], s[6:7] -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_b64 s[4:5], s[4:5], 0xffffffff00000000 +; GFX1250-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1250-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1250-NEXT: ; %bb.1: ; GFX1250-NEXT: s_cvt_f32_u32 s4, s6 @@ -3086,9 +3155,12 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[4:5], s[12:13] ; GFX1250-NEXT: s_add_co_u32 s8, s8, s12 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1250-NEXT: s_add_co_ci_u32 s9, s9, s13 -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_mul_u64 s[10:11], s[10:11], s[8:9] +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_mul_hi_u32 s13, s8, s11 ; GFX1250-NEXT: s_mul_i32 s12, s8, s11 ; GFX1250-NEXT: s_mul_hi_u32 s4, s8, s10 @@ -3103,17 +3175,19 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_add_nc_u64 s[10:11], s[4:5], s[10:11] ; GFX1250-NEXT: s_add_co_u32 s8, s8, s10 -; GFX1250-NEXT: s_add_co_ci_u32 s10, s9, s11 +; GFX1250-NEXT: s_cselect_b32 s10, -1, 0 ; GFX1250-NEXT: s_mul_hi_u32 s4, s2, s8 -; GFX1250-NEXT: s_mul_hi_u32 s11, s3, s8 -; GFX1250-NEXT: s_mul_i32 s12, s3, s8 +; GFX1250-NEXT: s_cmp_lg_u32 s10, 0 +; GFX1250-NEXT: s_mul_hi_u32 s12, s3, s8 +; GFX1250-NEXT: s_add_co_ci_u32 s10, s9, s11 +; GFX1250-NEXT: s_mul_i32 s11, s3, s8 ; GFX1250-NEXT: s_mul_hi_u32 s9, s2, s10 ; GFX1250-NEXT: s_mul_i32 s8, s2, s10 ; GFX1250-NEXT: s_mul_hi_u32 s13, s3, s10 ; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[8:9] ; GFX1250-NEXT: s_mul_i32 s10, s3, s10 -; GFX1250-NEXT: s_add_co_u32 s4, s8, s12 -; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s11 +; GFX1250-NEXT: s_add_co_u32 s4, s8, s11 +; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s12 ; GFX1250-NEXT: s_add_co_ci_u32 s11, s13, 0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[10:11] @@ -3128,8 +3202,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_cmp_lg_u32 s8, 0 ; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, s7 ; GFX1250-NEXT: s_sub_co_u32 s13, s4, s6 +; GFX1250-NEXT: s_cselect_b32 s14, -1, 0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, 0 -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_cmp_ge_u32 s12, s7 ; GFX1250-NEXT: s_cselect_b32 s14, -1, 0 ; GFX1250-NEXT: s_cmp_ge_u32 s13, s6 |