diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/srem.ll')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/srem.ll | 654 |
1 files changed, 361 insertions, 293 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index e12e31b..bbd1793 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -1513,7 +1513,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GCN-NEXT: s_sub_u32 s3, 0, s8 -; GCN-NEXT: s_subb_u32 s10, 0, s9 +; GCN-NEXT: s_subb_u32 s12, 0, s9 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1522,52 +1522,56 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s11, v1 -; GCN-NEXT: v_readfirstlane_b32 s12, v0 -; GCN-NEXT: s_mul_i32 s13, s3, s11 -; GCN-NEXT: s_mul_hi_u32 s15, s3, s12 -; GCN-NEXT: s_mul_i32 s14, s10, s12 -; GCN-NEXT: s_add_i32 s13, s15, s13 -; GCN-NEXT: s_add_i32 s13, s13, s14 -; GCN-NEXT: s_mul_i32 s16, s3, s12 -; GCN-NEXT: s_mul_i32 s15, s12, s13 -; GCN-NEXT: s_mul_hi_u32 s17, s12, s16 -; GCN-NEXT: s_mul_hi_u32 s14, s12, s13 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-NEXT: s_mul_i32 s11, s3, s13 +; GCN-NEXT: s_mul_hi_u32 s15, s3, s10 +; GCN-NEXT: s_mul_i32 s14, s12, s10 +; GCN-NEXT: s_add_i32 s11, s15, s11 +; GCN-NEXT: s_add_i32 s11, s11, s14 +; GCN-NEXT: s_mul_i32 s16, s3, s10 +; GCN-NEXT: s_mul_i32 s15, s10, s11 +; GCN-NEXT: s_mul_hi_u32 s17, s10, s16 +; GCN-NEXT: s_mul_hi_u32 s14, s10, s11 ; GCN-NEXT: s_add_u32 s15, s17, s15 ; GCN-NEXT: s_addc_u32 s14, 0, s14 -; GCN-NEXT: s_mul_hi_u32 s18, s11, s16 -; GCN-NEXT: s_mul_i32 s16, s11, s16 +; GCN-NEXT: s_mul_hi_u32 s18, s13, s16 +; GCN-NEXT: s_mul_i32 s16, s13, s16 ; GCN-NEXT: s_add_u32 s15, s15, s16 -; GCN-NEXT: s_mul_hi_u32 s17, s11, s13 +; GCN-NEXT: s_mul_hi_u32 s17, s13, s11 ; GCN-NEXT: s_addc_u32 s14, s14, s18 ; GCN-NEXT: s_addc_u32 s15, s17, 0 -; GCN-NEXT: s_mul_i32 s13, s11, s13 -; GCN-NEXT: s_add_u32 s13, s14, s13 +; GCN-NEXT: s_mul_i32 s11, s13, s11 +; GCN-NEXT: s_add_u32 s11, s14, s11 ; GCN-NEXT: s_addc_u32 s14, 0, s15 -; GCN-NEXT: s_add_u32 s12, s12, s13 -; GCN-NEXT: s_addc_u32 s11, s11, s14 -; GCN-NEXT: s_mul_i32 s13, s3, s11 -; GCN-NEXT: s_mul_hi_u32 s14, s3, s12 -; GCN-NEXT: s_add_i32 s13, s14, s13 -; GCN-NEXT: s_mul_i32 s10, s10, s12 -; GCN-NEXT: s_add_i32 s13, s13, s10 -; GCN-NEXT: s_mul_i32 s3, s3, s12 -; GCN-NEXT: s_mul_hi_u32 s14, s11, s3 -; GCN-NEXT: s_mul_i32 s15, s11, s3 -; GCN-NEXT: s_mul_i32 s17, s12, s13 -; GCN-NEXT: s_mul_hi_u32 s3, s12, s3 -; GCN-NEXT: s_mul_hi_u32 s16, s12, s13 +; GCN-NEXT: s_add_u32 s15, s10, s11 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GCN-NEXT: s_addc_u32 s13, s13, s14 +; GCN-NEXT: s_mul_i32 s10, s3, s13 +; GCN-NEXT: s_mul_hi_u32 s11, s3, s15 +; GCN-NEXT: s_add_i32 s10, s11, s10 +; GCN-NEXT: s_mul_i32 s12, s12, s15 +; GCN-NEXT: s_add_i32 s10, s10, s12 +; GCN-NEXT: s_mul_i32 s3, s3, s15 +; GCN-NEXT: s_mul_hi_u32 s12, s13, s3 +; GCN-NEXT: s_mul_i32 s14, s13, s3 +; GCN-NEXT: s_mul_i32 s17, s15, s10 +; GCN-NEXT: s_mul_hi_u32 s3, s15, s3 +; GCN-NEXT: s_mul_hi_u32 s16, s15, s10 ; GCN-NEXT: s_add_u32 s3, s3, s17 ; GCN-NEXT: s_addc_u32 s16, 0, s16 -; GCN-NEXT: s_add_u32 s3, s3, s15 -; GCN-NEXT: s_mul_hi_u32 s10, s11, s13 -; GCN-NEXT: s_addc_u32 s3, s16, s14 -; GCN-NEXT: s_addc_u32 s10, s10, 0 -; GCN-NEXT: s_mul_i32 s13, s11, s13 -; GCN-NEXT: s_add_u32 s3, s3, s13 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: s_add_u32 s3, s12, s3 -; GCN-NEXT: s_addc_u32 s14, s11, s10 +; GCN-NEXT: s_add_u32 s3, s3, s14 +; GCN-NEXT: s_mul_hi_u32 s11, s13, s10 +; GCN-NEXT: s_addc_u32 s3, s16, s12 +; GCN-NEXT: s_addc_u32 s11, s11, 0 +; GCN-NEXT: s_mul_i32 s10, s13, s10 +; GCN-NEXT: s_add_u32 s3, s3, s10 +; GCN-NEXT: s_addc_u32 s12, 0, s11 +; GCN-NEXT: s_add_u32 s3, s15, s3 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GCN-NEXT: s_addc_u32 s14, s13, s12 ; GCN-NEXT: s_ashr_i32 s10, s5, 31 ; GCN-NEXT: s_add_u32 s12, s4, s10 ; GCN-NEXT: s_mov_b32 s11, s10 @@ -1596,9 +1600,11 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: s_mul_i32 s3, s8, s3 ; GCN-NEXT: s_sub_u32 s3, s12, s3 ; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 ; GCN-NEXT: s_subb_u32 s12, s16, s9 ; GCN-NEXT: s_sub_u32 s18, s3, s8 ; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s19, s12, 0 ; GCN-NEXT: s_cmp_ge_u32 s19, s9 ; GCN-NEXT: s_cselect_b32 s20, -1, 0 @@ -1608,10 +1614,12 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: s_cselect_b32 s20, s21, s20 ; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s12, s12, s9 -; GCN-NEXT: s_sub_u32 s16, s18, s8 +; GCN-NEXT: s_sub_u32 s21, s18, s8 +; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s12, s12, 0 ; GCN-NEXT: s_cmp_lg_u32 s20, 0 -; GCN-NEXT: s_cselect_b32 s16, s16, s18 +; GCN-NEXT: s_cselect_b32 s16, s21, s18 ; GCN-NEXT: s_cselect_b32 s12, s12, s19 ; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 ; GCN-NEXT: s_subb_u32 s5, s13, s5 @@ -1923,9 +1931,11 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TONGA-NEXT: v_readfirstlane_b32 s14, v0 ; TONGA-NEXT: s_sub_u32 s12, s12, s14 ; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s3, s3, s7 ; TONGA-NEXT: s_sub_u32 s18, s12, s6 ; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s19, s3, 0 ; TONGA-NEXT: s_cmp_ge_u32 s19, s7 ; TONGA-NEXT: s_cselect_b32 s20, -1, 0 @@ -1935,10 +1945,12 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TONGA-NEXT: s_cselect_b32 s20, s21, s20 ; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s3, s3, s7 -; TONGA-NEXT: s_sub_u32 s16, s18, s6 +; TONGA-NEXT: s_sub_u32 s21, s18, s6 +; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s3, s3, 0 ; TONGA-NEXT: s_cmp_lg_u32 s20, 0 -; TONGA-NEXT: s_cselect_b32 s16, s16, s18 +; TONGA-NEXT: s_cselect_b32 s16, s21, s18 ; TONGA-NEXT: s_cselect_b32 s3, s3, s19 ; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s5, s13, s5 @@ -2718,7 +2730,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GCN-NEXT: s_sub_u32 s9, 0, s6 -; GCN-NEXT: s_subb_u32 s14, 0, s7 +; GCN-NEXT: s_subb_u32 s16, 0, s7 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2727,52 +2739,56 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s15, v1 -; GCN-NEXT: v_readfirstlane_b32 s16, v0 -; GCN-NEXT: s_mul_i32 s17, s9, s15 -; GCN-NEXT: s_mul_hi_u32 s19, s9, s16 -; GCN-NEXT: s_mul_i32 s18, s14, s16 -; GCN-NEXT: s_add_i32 s17, s19, s17 -; GCN-NEXT: s_add_i32 s17, s17, s18 -; GCN-NEXT: s_mul_i32 s20, s9, s16 -; GCN-NEXT: s_mul_i32 s19, s16, s17 -; GCN-NEXT: s_mul_hi_u32 s21, s16, s20 -; GCN-NEXT: s_mul_hi_u32 s18, s16, s17 +; GCN-NEXT: v_readfirstlane_b32 s17, v1 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s15, s9, s17 +; GCN-NEXT: s_mul_hi_u32 s19, s9, s14 +; GCN-NEXT: s_mul_i32 s18, s16, s14 +; GCN-NEXT: s_add_i32 s15, s19, s15 +; GCN-NEXT: s_add_i32 s15, s15, s18 +; GCN-NEXT: s_mul_i32 s20, s9, s14 +; GCN-NEXT: s_mul_i32 s19, s14, s15 +; GCN-NEXT: s_mul_hi_u32 s21, s14, s20 +; GCN-NEXT: s_mul_hi_u32 s18, s14, s15 ; GCN-NEXT: s_add_u32 s19, s21, s19 ; GCN-NEXT: s_addc_u32 s18, 0, s18 -; GCN-NEXT: s_mul_hi_u32 s22, s15, s20 -; GCN-NEXT: s_mul_i32 s20, s15, s20 +; GCN-NEXT: s_mul_hi_u32 s22, s17, s20 +; GCN-NEXT: s_mul_i32 s20, s17, s20 ; GCN-NEXT: s_add_u32 s19, s19, s20 -; GCN-NEXT: s_mul_hi_u32 s21, s15, s17 +; GCN-NEXT: s_mul_hi_u32 s21, s17, s15 ; GCN-NEXT: s_addc_u32 s18, s18, s22 ; GCN-NEXT: s_addc_u32 s19, s21, 0 -; GCN-NEXT: s_mul_i32 s17, s15, s17 -; GCN-NEXT: s_add_u32 s17, s18, s17 +; GCN-NEXT: s_mul_i32 s15, s17, s15 +; GCN-NEXT: s_add_u32 s15, s18, s15 ; GCN-NEXT: s_addc_u32 s18, 0, s19 -; GCN-NEXT: s_add_u32 s16, s16, s17 -; GCN-NEXT: s_addc_u32 s15, s15, s18 -; GCN-NEXT: s_mul_i32 s17, s9, s15 -; GCN-NEXT: s_mul_hi_u32 s18, s9, s16 -; GCN-NEXT: s_add_i32 s17, s18, s17 -; GCN-NEXT: s_mul_i32 s14, s14, s16 -; GCN-NEXT: s_add_i32 s17, s17, s14 -; GCN-NEXT: s_mul_i32 s9, s9, s16 -; GCN-NEXT: s_mul_hi_u32 s18, s15, s9 -; GCN-NEXT: s_mul_i32 s19, s15, s9 -; GCN-NEXT: s_mul_i32 s21, s16, s17 -; GCN-NEXT: s_mul_hi_u32 s9, s16, s9 -; GCN-NEXT: s_mul_hi_u32 s20, s16, s17 +; GCN-NEXT: s_add_u32 s19, s14, s15 +; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 +; GCN-NEXT: s_addc_u32 s17, s17, s18 +; GCN-NEXT: s_mul_i32 s14, s9, s17 +; GCN-NEXT: s_mul_hi_u32 s15, s9, s19 +; GCN-NEXT: s_add_i32 s14, s15, s14 +; GCN-NEXT: s_mul_i32 s16, s16, s19 +; GCN-NEXT: s_add_i32 s14, s14, s16 +; GCN-NEXT: s_mul_i32 s9, s9, s19 +; GCN-NEXT: s_mul_hi_u32 s16, s17, s9 +; GCN-NEXT: s_mul_i32 s18, s17, s9 +; GCN-NEXT: s_mul_i32 s21, s19, s14 +; GCN-NEXT: s_mul_hi_u32 s9, s19, s9 +; GCN-NEXT: s_mul_hi_u32 s20, s19, s14 ; GCN-NEXT: s_add_u32 s9, s9, s21 ; GCN-NEXT: s_addc_u32 s20, 0, s20 -; GCN-NEXT: s_add_u32 s9, s9, s19 -; GCN-NEXT: s_mul_hi_u32 s14, s15, s17 -; GCN-NEXT: s_addc_u32 s9, s20, s18 -; GCN-NEXT: s_addc_u32 s14, s14, 0 -; GCN-NEXT: s_mul_i32 s17, s15, s17 -; GCN-NEXT: s_add_u32 s9, s9, s17 -; GCN-NEXT: s_addc_u32 s14, 0, s14 -; GCN-NEXT: s_add_u32 s9, s16, s9 -; GCN-NEXT: s_addc_u32 s18, s15, s14 +; GCN-NEXT: s_add_u32 s9, s9, s18 +; GCN-NEXT: s_mul_hi_u32 s15, s17, s14 +; GCN-NEXT: s_addc_u32 s9, s20, s16 +; GCN-NEXT: s_addc_u32 s15, s15, 0 +; GCN-NEXT: s_mul_i32 s14, s17, s14 +; GCN-NEXT: s_add_u32 s9, s9, s14 +; GCN-NEXT: s_addc_u32 s16, 0, s15 +; GCN-NEXT: s_add_u32 s9, s19, s9 +; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 +; GCN-NEXT: s_addc_u32 s18, s17, s16 ; GCN-NEXT: s_ashr_i32 s14, s11, 31 ; GCN-NEXT: s_add_u32 s16, s10, s14 ; GCN-NEXT: s_mov_b32 s15, s14 @@ -2801,9 +2817,11 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s9, s6, s9 ; GCN-NEXT: s_sub_u32 s9, s16, s9 ; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s16, s20, s7 ; GCN-NEXT: s_sub_u32 s22, s9, s6 ; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s23, s16, 0 ; GCN-NEXT: s_cmp_ge_u32 s23, s7 ; GCN-NEXT: s_cselect_b32 s24, -1, 0 @@ -2813,10 +2831,12 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s24, s25, s24 ; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s16, s16, s7 -; GCN-NEXT: s_sub_u32 s20, s22, s6 +; GCN-NEXT: s_sub_u32 s25, s22, s6 +; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s16, s16, 0 ; GCN-NEXT: s_cmp_lg_u32 s24, 0 -; GCN-NEXT: s_cselect_b32 s20, s20, s22 +; GCN-NEXT: s_cselect_b32 s20, s25, s22 ; GCN-NEXT: s_cselect_b32 s16, s16, s23 ; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s11, s17, s11 @@ -2867,7 +2887,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GCN-NEXT: s_sub_u32 s3, 0, s10 -; GCN-NEXT: s_subb_u32 s12, 0, s11 +; GCN-NEXT: s_subb_u32 s14, 0, s11 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2876,52 +2896,56 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s13, v1 -; GCN-NEXT: v_readfirstlane_b32 s14, v0 -; GCN-NEXT: s_mul_i32 s15, s3, s13 -; GCN-NEXT: s_mul_hi_u32 s17, s3, s14 -; GCN-NEXT: s_mul_i32 s16, s12, s14 -; GCN-NEXT: s_add_i32 s15, s17, s15 -; GCN-NEXT: s_add_i32 s15, s15, s16 -; GCN-NEXT: s_mul_i32 s18, s3, s14 -; GCN-NEXT: s_mul_i32 s17, s14, s15 -; GCN-NEXT: s_mul_hi_u32 s19, s14, s18 -; GCN-NEXT: s_mul_hi_u32 s16, s14, s15 +; GCN-NEXT: v_readfirstlane_b32 s15, v1 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_mul_i32 s13, s3, s15 +; GCN-NEXT: s_mul_hi_u32 s17, s3, s12 +; GCN-NEXT: s_mul_i32 s16, s14, s12 +; GCN-NEXT: s_add_i32 s13, s17, s13 +; GCN-NEXT: s_add_i32 s13, s13, s16 +; GCN-NEXT: s_mul_i32 s18, s3, s12 +; GCN-NEXT: s_mul_i32 s17, s12, s13 +; GCN-NEXT: s_mul_hi_u32 s19, s12, s18 +; GCN-NEXT: s_mul_hi_u32 s16, s12, s13 ; GCN-NEXT: s_add_u32 s17, s19, s17 ; GCN-NEXT: s_addc_u32 s16, 0, s16 -; GCN-NEXT: s_mul_hi_u32 s20, s13, s18 -; GCN-NEXT: s_mul_i32 s18, s13, s18 +; GCN-NEXT: s_mul_hi_u32 s20, s15, s18 +; GCN-NEXT: s_mul_i32 s18, s15, s18 ; GCN-NEXT: s_add_u32 s17, s17, s18 -; GCN-NEXT: s_mul_hi_u32 s19, s13, s15 +; GCN-NEXT: s_mul_hi_u32 s19, s15, s13 ; GCN-NEXT: s_addc_u32 s16, s16, s20 ; GCN-NEXT: s_addc_u32 s17, s19, 0 -; GCN-NEXT: s_mul_i32 s15, s13, s15 -; GCN-NEXT: s_add_u32 s15, s16, s15 +; GCN-NEXT: s_mul_i32 s13, s15, s13 +; GCN-NEXT: s_add_u32 s13, s16, s13 ; GCN-NEXT: s_addc_u32 s16, 0, s17 -; GCN-NEXT: s_add_u32 s14, s14, s15 -; GCN-NEXT: s_addc_u32 s13, s13, s16 -; GCN-NEXT: s_mul_i32 s15, s3, s13 -; GCN-NEXT: s_mul_hi_u32 s16, s3, s14 -; GCN-NEXT: s_add_i32 s15, s16, s15 -; GCN-NEXT: s_mul_i32 s12, s12, s14 -; GCN-NEXT: s_add_i32 s15, s15, s12 -; GCN-NEXT: s_mul_i32 s3, s3, s14 -; GCN-NEXT: s_mul_hi_u32 s16, s13, s3 -; GCN-NEXT: s_mul_i32 s17, s13, s3 -; GCN-NEXT: s_mul_i32 s19, s14, s15 -; GCN-NEXT: s_mul_hi_u32 s3, s14, s3 -; GCN-NEXT: s_mul_hi_u32 s18, s14, s15 +; GCN-NEXT: s_add_u32 s17, s12, s13 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GCN-NEXT: s_addc_u32 s15, s15, s16 +; GCN-NEXT: s_mul_i32 s12, s3, s15 +; GCN-NEXT: s_mul_hi_u32 s13, s3, s17 +; GCN-NEXT: s_add_i32 s12, s13, s12 +; GCN-NEXT: s_mul_i32 s14, s14, s17 +; GCN-NEXT: s_add_i32 s12, s12, s14 +; GCN-NEXT: s_mul_i32 s3, s3, s17 +; GCN-NEXT: s_mul_hi_u32 s14, s15, s3 +; GCN-NEXT: s_mul_i32 s16, s15, s3 +; GCN-NEXT: s_mul_i32 s19, s17, s12 +; GCN-NEXT: s_mul_hi_u32 s3, s17, s3 +; GCN-NEXT: s_mul_hi_u32 s18, s17, s12 ; GCN-NEXT: s_add_u32 s3, s3, s19 ; GCN-NEXT: s_addc_u32 s18, 0, s18 -; GCN-NEXT: s_add_u32 s3, s3, s17 -; GCN-NEXT: s_mul_hi_u32 s12, s13, s15 -; GCN-NEXT: s_addc_u32 s3, s18, s16 -; GCN-NEXT: s_addc_u32 s12, s12, 0 -; GCN-NEXT: s_mul_i32 s15, s13, s15 -; GCN-NEXT: s_add_u32 s3, s3, s15 -; GCN-NEXT: s_addc_u32 s12, 0, s12 -; GCN-NEXT: s_add_u32 s3, s14, s3 -; GCN-NEXT: s_addc_u32 s16, s13, s12 +; GCN-NEXT: s_add_u32 s3, s3, s16 +; GCN-NEXT: s_mul_hi_u32 s13, s15, s12 +; GCN-NEXT: s_addc_u32 s3, s18, s14 +; GCN-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NEXT: s_mul_i32 s12, s15, s12 +; GCN-NEXT: s_add_u32 s3, s3, s12 +; GCN-NEXT: s_addc_u32 s14, 0, s13 +; GCN-NEXT: s_add_u32 s3, s17, s3 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GCN-NEXT: s_addc_u32 s16, s15, s14 ; GCN-NEXT: s_ashr_i32 s12, s5, 31 ; GCN-NEXT: s_add_u32 s14, s4, s12 ; GCN-NEXT: s_mov_b32 s13, s12 @@ -2950,9 +2974,11 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s3, s10, s3 ; GCN-NEXT: s_sub_u32 s3, s14, s3 ; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s14, s18, s11 ; GCN-NEXT: s_sub_u32 s20, s3, s10 ; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s21, s14, 0 ; GCN-NEXT: s_cmp_ge_u32 s21, s11 ; GCN-NEXT: s_cselect_b32 s22, -1, 0 @@ -2962,10 +2988,12 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s22, s23, s22 ; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s14, s14, s11 -; GCN-NEXT: s_sub_u32 s18, s20, s10 +; GCN-NEXT: s_sub_u32 s23, s20, s10 +; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s14, s14, 0 ; GCN-NEXT: s_cmp_lg_u32 s22, 0 -; GCN-NEXT: s_cselect_b32 s18, s18, s20 +; GCN-NEXT: s_cselect_b32 s18, s23, s20 ; GCN-NEXT: s_cselect_b32 s14, s14, s21 ; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s5, s15, s5 @@ -3435,9 +3463,11 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_readfirstlane_b32 s14, v0 ; TONGA-NEXT: s_sub_u32 s12, s12, s14 ; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, s7 ; TONGA-NEXT: s_sub_u32 s18, s12, s6 ; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s19, s1, 0 ; TONGA-NEXT: s_cmp_ge_u32 s19, s7 ; TONGA-NEXT: s_cselect_b32 s20, -1, 0 @@ -3447,10 +3477,12 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_cselect_b32 s20, s21, s20 ; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, s7 -; TONGA-NEXT: s_sub_u32 s16, s18, s6 +; TONGA-NEXT: s_sub_u32 s21, s18, s6 +; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, 0 ; TONGA-NEXT: s_cmp_lg_u32 s20, 0 -; TONGA-NEXT: s_cselect_b32 s16, s16, s18 +; TONGA-NEXT: s_cselect_b32 s16, s21, s18 ; TONGA-NEXT: s_cselect_b32 s1, s1, s19 ; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s3, s13, s3 @@ -4902,7 +4934,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GCN-NEXT: s_sub_u32 s17, 0, s6 -; GCN-NEXT: s_subb_u32 s22, 0, s7 +; GCN-NEXT: s_subb_u32 s24, 0, s7 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -4911,52 +4943,56 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s23, v1 -; GCN-NEXT: v_readfirstlane_b32 s24, v0 -; GCN-NEXT: s_mul_i32 s25, s17, s23 -; GCN-NEXT: s_mul_hi_u32 s27, s17, s24 -; GCN-NEXT: s_mul_i32 s26, s22, s24 -; GCN-NEXT: s_add_i32 s25, s27, s25 -; GCN-NEXT: s_add_i32 s25, s25, s26 -; GCN-NEXT: s_mul_i32 s28, s17, s24 -; GCN-NEXT: s_mul_i32 s27, s24, s25 -; GCN-NEXT: s_mul_hi_u32 s29, s24, s28 -; GCN-NEXT: s_mul_hi_u32 s26, s24, s25 +; GCN-NEXT: v_readfirstlane_b32 s25, v1 +; GCN-NEXT: v_readfirstlane_b32 s22, v0 +; GCN-NEXT: s_mul_i32 s23, s17, s25 +; GCN-NEXT: s_mul_hi_u32 s27, s17, s22 +; GCN-NEXT: s_mul_i32 s26, s24, s22 +; GCN-NEXT: s_add_i32 s23, s27, s23 +; GCN-NEXT: s_add_i32 s23, s23, s26 +; GCN-NEXT: s_mul_i32 s28, s17, s22 +; GCN-NEXT: s_mul_i32 s27, s22, s23 +; GCN-NEXT: s_mul_hi_u32 s29, s22, s28 +; GCN-NEXT: s_mul_hi_u32 s26, s22, s23 ; GCN-NEXT: s_add_u32 s27, s29, s27 ; GCN-NEXT: s_addc_u32 s26, 0, s26 -; GCN-NEXT: s_mul_hi_u32 s30, s23, s28 -; GCN-NEXT: s_mul_i32 s28, s23, s28 +; GCN-NEXT: s_mul_hi_u32 s30, s25, s28 +; GCN-NEXT: s_mul_i32 s28, s25, s28 ; GCN-NEXT: s_add_u32 s27, s27, s28 -; GCN-NEXT: s_mul_hi_u32 s29, s23, s25 +; GCN-NEXT: s_mul_hi_u32 s29, s25, s23 ; GCN-NEXT: s_addc_u32 s26, s26, s30 ; GCN-NEXT: s_addc_u32 s27, s29, 0 -; GCN-NEXT: s_mul_i32 s25, s23, s25 -; GCN-NEXT: s_add_u32 s25, s26, s25 +; GCN-NEXT: s_mul_i32 s23, s25, s23 +; GCN-NEXT: s_add_u32 s23, s26, s23 ; GCN-NEXT: s_addc_u32 s26, 0, s27 -; GCN-NEXT: s_add_u32 s24, s24, s25 -; GCN-NEXT: s_addc_u32 s23, s23, s26 -; GCN-NEXT: s_mul_i32 s25, s17, s23 -; GCN-NEXT: s_mul_hi_u32 s26, s17, s24 -; GCN-NEXT: s_add_i32 s25, s26, s25 -; GCN-NEXT: s_mul_i32 s22, s22, s24 -; GCN-NEXT: s_add_i32 s25, s25, s22 -; GCN-NEXT: s_mul_i32 s17, s17, s24 -; GCN-NEXT: s_mul_hi_u32 s26, s23, s17 -; GCN-NEXT: s_mul_i32 s27, s23, s17 -; GCN-NEXT: s_mul_i32 s29, s24, s25 -; GCN-NEXT: s_mul_hi_u32 s17, s24, s17 -; GCN-NEXT: s_mul_hi_u32 s28, s24, s25 +; GCN-NEXT: s_add_u32 s27, s22, s23 +; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 +; GCN-NEXT: s_addc_u32 s25, s25, s26 +; GCN-NEXT: s_mul_i32 s22, s17, s25 +; GCN-NEXT: s_mul_hi_u32 s23, s17, s27 +; GCN-NEXT: s_add_i32 s22, s23, s22 +; GCN-NEXT: s_mul_i32 s24, s24, s27 +; GCN-NEXT: s_add_i32 s22, s22, s24 +; GCN-NEXT: s_mul_i32 s17, s17, s27 +; GCN-NEXT: s_mul_hi_u32 s24, s25, s17 +; GCN-NEXT: s_mul_i32 s26, s25, s17 +; GCN-NEXT: s_mul_i32 s29, s27, s22 +; GCN-NEXT: s_mul_hi_u32 s17, s27, s17 +; GCN-NEXT: s_mul_hi_u32 s28, s27, s22 ; GCN-NEXT: s_add_u32 s17, s17, s29 ; GCN-NEXT: s_addc_u32 s28, 0, s28 -; GCN-NEXT: s_add_u32 s17, s17, s27 -; GCN-NEXT: s_mul_hi_u32 s22, s23, s25 -; GCN-NEXT: s_addc_u32 s17, s28, s26 -; GCN-NEXT: s_addc_u32 s22, s22, 0 -; GCN-NEXT: s_mul_i32 s25, s23, s25 -; GCN-NEXT: s_add_u32 s17, s17, s25 -; GCN-NEXT: s_addc_u32 s22, 0, s22 -; GCN-NEXT: s_add_u32 s17, s24, s17 -; GCN-NEXT: s_addc_u32 s26, s23, s22 +; GCN-NEXT: s_add_u32 s17, s17, s26 +; GCN-NEXT: s_mul_hi_u32 s23, s25, s22 +; GCN-NEXT: s_addc_u32 s17, s28, s24 +; GCN-NEXT: s_addc_u32 s23, s23, 0 +; GCN-NEXT: s_mul_i32 s22, s25, s22 +; GCN-NEXT: s_add_u32 s17, s17, s22 +; GCN-NEXT: s_addc_u32 s24, 0, s23 +; GCN-NEXT: s_add_u32 s17, s27, s17 +; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 +; GCN-NEXT: s_addc_u32 s26, s25, s24 ; GCN-NEXT: s_ashr_i32 s22, s19, 31 ; GCN-NEXT: s_add_u32 s24, s18, s22 ; GCN-NEXT: s_mov_b32 s23, s22 @@ -4985,9 +5021,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s17, s6, s17 ; GCN-NEXT: s_sub_u32 s17, s24, s17 ; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s24, s28, s7 ; GCN-NEXT: s_sub_u32 s30, s17, s6 ; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0 ; GCN-NEXT: s_subb_u32 s31, s24, 0 ; GCN-NEXT: s_cmp_ge_u32 s31, s7 ; GCN-NEXT: s_cselect_b32 s33, -1, 0 @@ -4997,10 +5035,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s33, s34, s33 ; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0 ; GCN-NEXT: s_subb_u32 s24, s24, s7 -; GCN-NEXT: s_sub_u32 s28, s30, s6 +; GCN-NEXT: s_sub_u32 s34, s30, s6 +; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0 ; GCN-NEXT: s_subb_u32 s24, s24, 0 ; GCN-NEXT: s_cmp_lg_u32 s33, 0 -; GCN-NEXT: s_cselect_b32 s28, s28, s30 +; GCN-NEXT: s_cselect_b32 s28, s34, s30 ; GCN-NEXT: s_cselect_b32 s24, s24, s31 ; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s19, s25, s19 @@ -5051,7 +5091,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s18 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s19 ; GCN-NEXT: s_sub_u32 s13, 0, s18 -; GCN-NEXT: s_subb_u32 s20, 0, s19 +; GCN-NEXT: s_subb_u32 s22, 0, s19 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -5060,52 +5100,56 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s21, v1 -; GCN-NEXT: v_readfirstlane_b32 s22, v0 -; GCN-NEXT: s_mul_i32 s23, s13, s21 -; GCN-NEXT: s_mul_hi_u32 s25, s13, s22 -; GCN-NEXT: s_mul_i32 s24, s20, s22 -; GCN-NEXT: s_add_i32 s23, s25, s23 -; GCN-NEXT: s_add_i32 s23, s23, s24 -; GCN-NEXT: s_mul_i32 s26, s13, s22 -; GCN-NEXT: s_mul_i32 s25, s22, s23 -; GCN-NEXT: s_mul_hi_u32 s27, s22, s26 -; GCN-NEXT: s_mul_hi_u32 s24, s22, s23 +; GCN-NEXT: v_readfirstlane_b32 s23, v1 +; GCN-NEXT: v_readfirstlane_b32 s20, v0 +; GCN-NEXT: s_mul_i32 s21, s13, s23 +; GCN-NEXT: s_mul_hi_u32 s25, s13, s20 +; GCN-NEXT: s_mul_i32 s24, s22, s20 +; GCN-NEXT: s_add_i32 s21, s25, s21 +; GCN-NEXT: s_add_i32 s21, s21, s24 +; GCN-NEXT: s_mul_i32 s26, s13, s20 +; GCN-NEXT: s_mul_i32 s25, s20, s21 +; GCN-NEXT: s_mul_hi_u32 s27, s20, s26 +; GCN-NEXT: s_mul_hi_u32 s24, s20, s21 ; GCN-NEXT: s_add_u32 s25, s27, s25 ; GCN-NEXT: s_addc_u32 s24, 0, s24 -; GCN-NEXT: s_mul_hi_u32 s28, s21, s26 -; GCN-NEXT: s_mul_i32 s26, s21, s26 +; GCN-NEXT: s_mul_hi_u32 s28, s23, s26 +; GCN-NEXT: s_mul_i32 s26, s23, s26 ; GCN-NEXT: s_add_u32 s25, s25, s26 -; GCN-NEXT: s_mul_hi_u32 s27, s21, s23 +; GCN-NEXT: s_mul_hi_u32 s27, s23, s21 ; GCN-NEXT: s_addc_u32 s24, s24, s28 ; GCN-NEXT: s_addc_u32 s25, s27, 0 -; GCN-NEXT: s_mul_i32 s23, s21, s23 -; GCN-NEXT: s_add_u32 s23, s24, s23 +; GCN-NEXT: s_mul_i32 s21, s23, s21 +; GCN-NEXT: s_add_u32 s21, s24, s21 ; GCN-NEXT: s_addc_u32 s24, 0, s25 -; GCN-NEXT: s_add_u32 s22, s22, s23 -; GCN-NEXT: s_addc_u32 s21, s21, s24 -; GCN-NEXT: s_mul_i32 s23, s13, s21 -; GCN-NEXT: s_mul_hi_u32 s24, s13, s22 -; GCN-NEXT: s_add_i32 s23, s24, s23 -; GCN-NEXT: s_mul_i32 s20, s20, s22 -; GCN-NEXT: s_add_i32 s23, s23, s20 -; GCN-NEXT: s_mul_i32 s13, s13, s22 -; GCN-NEXT: s_mul_hi_u32 s24, s21, s13 -; GCN-NEXT: s_mul_i32 s25, s21, s13 -; GCN-NEXT: s_mul_i32 s27, s22, s23 -; GCN-NEXT: s_mul_hi_u32 s13, s22, s13 -; GCN-NEXT: s_mul_hi_u32 s26, s22, s23 +; GCN-NEXT: s_add_u32 s25, s20, s21 +; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 +; GCN-NEXT: s_addc_u32 s23, s23, s24 +; GCN-NEXT: s_mul_i32 s20, s13, s23 +; GCN-NEXT: s_mul_hi_u32 s21, s13, s25 +; GCN-NEXT: s_add_i32 s20, s21, s20 +; GCN-NEXT: s_mul_i32 s22, s22, s25 +; GCN-NEXT: s_add_i32 s20, s20, s22 +; GCN-NEXT: s_mul_i32 s13, s13, s25 +; GCN-NEXT: s_mul_hi_u32 s22, s23, s13 +; GCN-NEXT: s_mul_i32 s24, s23, s13 +; GCN-NEXT: s_mul_i32 s27, s25, s20 +; GCN-NEXT: s_mul_hi_u32 s13, s25, s13 +; GCN-NEXT: s_mul_hi_u32 s26, s25, s20 ; GCN-NEXT: s_add_u32 s13, s13, s27 ; GCN-NEXT: s_addc_u32 s26, 0, s26 -; GCN-NEXT: s_add_u32 s13, s13, s25 -; GCN-NEXT: s_mul_hi_u32 s20, s21, s23 -; GCN-NEXT: s_addc_u32 s13, s26, s24 -; GCN-NEXT: s_addc_u32 s20, s20, 0 -; GCN-NEXT: s_mul_i32 s23, s21, s23 -; GCN-NEXT: s_add_u32 s13, s13, s23 -; GCN-NEXT: s_addc_u32 s20, 0, s20 -; GCN-NEXT: s_add_u32 s13, s22, s13 -; GCN-NEXT: s_addc_u32 s24, s21, s20 +; GCN-NEXT: s_add_u32 s13, s13, s24 +; GCN-NEXT: s_mul_hi_u32 s21, s23, s20 +; GCN-NEXT: s_addc_u32 s13, s26, s22 +; GCN-NEXT: s_addc_u32 s21, s21, 0 +; GCN-NEXT: s_mul_i32 s20, s23, s20 +; GCN-NEXT: s_add_u32 s13, s13, s20 +; GCN-NEXT: s_addc_u32 s22, 0, s21 +; GCN-NEXT: s_add_u32 s13, s25, s13 +; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 +; GCN-NEXT: s_addc_u32 s24, s23, s22 ; GCN-NEXT: s_ashr_i32 s20, s15, 31 ; GCN-NEXT: s_add_u32 s22, s14, s20 ; GCN-NEXT: s_mov_b32 s21, s20 @@ -5134,9 +5178,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s13, s18, s13 ; GCN-NEXT: s_sub_u32 s13, s22, s13 ; GCN-NEXT: s_cselect_b64 s[24:25], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0 ; GCN-NEXT: s_subb_u32 s22, s26, s19 ; GCN-NEXT: s_sub_u32 s28, s13, s18 ; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s29, s22, 0 ; GCN-NEXT: s_cmp_ge_u32 s29, s19 ; GCN-NEXT: s_cselect_b32 s30, -1, 0 @@ -5146,10 +5192,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s30, s31, s30 ; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s22, s22, s19 -; GCN-NEXT: s_sub_u32 s26, s28, s18 +; GCN-NEXT: s_sub_u32 s31, s28, s18 +; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s22, s22, 0 ; GCN-NEXT: s_cmp_lg_u32 s30, 0 -; GCN-NEXT: s_cselect_b32 s26, s26, s28 +; GCN-NEXT: s_cselect_b32 s26, s31, s28 ; GCN-NEXT: s_cselect_b32 s22, s22, s29 ; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0 ; GCN-NEXT: s_subb_u32 s15, s23, s15 @@ -5209,7 +5257,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s14 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s15 ; GCN-NEXT: s_sub_u32 s9, 0, s14 -; GCN-NEXT: s_subb_u32 s16, 0, s15 +; GCN-NEXT: s_subb_u32 s18, 0, s15 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -5218,52 +5266,56 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s17, v1 -; GCN-NEXT: v_readfirstlane_b32 s18, v0 -; GCN-NEXT: s_mul_i32 s19, s9, s17 -; GCN-NEXT: s_mul_hi_u32 s21, s9, s18 -; GCN-NEXT: s_mul_i32 s20, s16, s18 -; GCN-NEXT: s_add_i32 s19, s21, s19 -; GCN-NEXT: s_add_i32 s19, s19, s20 -; GCN-NEXT: s_mul_i32 s22, s9, s18 -; GCN-NEXT: s_mul_i32 s21, s18, s19 -; GCN-NEXT: s_mul_hi_u32 s23, s18, s22 -; GCN-NEXT: s_mul_hi_u32 s20, s18, s19 +; GCN-NEXT: v_readfirstlane_b32 s19, v1 +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; GCN-NEXT: s_mul_i32 s17, s9, s19 +; GCN-NEXT: s_mul_hi_u32 s21, s9, s16 +; GCN-NEXT: s_mul_i32 s20, s18, s16 +; GCN-NEXT: s_add_i32 s17, s21, s17 +; GCN-NEXT: s_add_i32 s17, s17, s20 +; GCN-NEXT: s_mul_i32 s22, s9, s16 +; GCN-NEXT: s_mul_i32 s21, s16, s17 +; GCN-NEXT: s_mul_hi_u32 s23, s16, s22 +; GCN-NEXT: s_mul_hi_u32 s20, s16, s17 ; GCN-NEXT: s_add_u32 s21, s23, s21 ; GCN-NEXT: s_addc_u32 s20, 0, s20 -; GCN-NEXT: s_mul_hi_u32 s24, s17, s22 -; GCN-NEXT: s_mul_i32 s22, s17, s22 +; GCN-NEXT: s_mul_hi_u32 s24, s19, s22 +; GCN-NEXT: s_mul_i32 s22, s19, s22 ; GCN-NEXT: s_add_u32 s21, s21, s22 -; GCN-NEXT: s_mul_hi_u32 s23, s17, s19 +; GCN-NEXT: s_mul_hi_u32 s23, s19, s17 ; GCN-NEXT: s_addc_u32 s20, s20, s24 ; GCN-NEXT: s_addc_u32 s21, s23, 0 -; GCN-NEXT: s_mul_i32 s19, s17, s19 -; GCN-NEXT: s_add_u32 s19, s20, s19 +; GCN-NEXT: s_mul_i32 s17, s19, s17 +; GCN-NEXT: s_add_u32 s17, s20, s17 ; GCN-NEXT: s_addc_u32 s20, 0, s21 -; GCN-NEXT: s_add_u32 s18, s18, s19 -; GCN-NEXT: s_addc_u32 s17, s17, s20 -; GCN-NEXT: s_mul_i32 s19, s9, s17 -; GCN-NEXT: s_mul_hi_u32 s20, s9, s18 -; GCN-NEXT: s_add_i32 s19, s20, s19 -; GCN-NEXT: s_mul_i32 s16, s16, s18 -; GCN-NEXT: s_add_i32 s19, s19, s16 -; GCN-NEXT: s_mul_i32 s9, s9, s18 -; GCN-NEXT: s_mul_hi_u32 s20, s17, s9 -; GCN-NEXT: s_mul_i32 s21, s17, s9 -; GCN-NEXT: s_mul_i32 s23, s18, s19 -; GCN-NEXT: s_mul_hi_u32 s9, s18, s9 -; GCN-NEXT: s_mul_hi_u32 s22, s18, s19 +; GCN-NEXT: s_add_u32 s21, s16, s17 +; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GCN-NEXT: s_addc_u32 s19, s19, s20 +; GCN-NEXT: s_mul_i32 s16, s9, s19 +; GCN-NEXT: s_mul_hi_u32 s17, s9, s21 +; GCN-NEXT: s_add_i32 s16, s17, s16 +; GCN-NEXT: s_mul_i32 s18, s18, s21 +; GCN-NEXT: s_add_i32 s16, s16, s18 +; GCN-NEXT: s_mul_i32 s9, s9, s21 +; GCN-NEXT: s_mul_hi_u32 s18, s19, s9 +; GCN-NEXT: s_mul_i32 s20, s19, s9 +; GCN-NEXT: s_mul_i32 s23, s21, s16 +; GCN-NEXT: s_mul_hi_u32 s9, s21, s9 +; GCN-NEXT: s_mul_hi_u32 s22, s21, s16 ; GCN-NEXT: s_add_u32 s9, s9, s23 ; GCN-NEXT: s_addc_u32 s22, 0, s22 -; GCN-NEXT: s_add_u32 s9, s9, s21 -; GCN-NEXT: s_mul_hi_u32 s16, s17, s19 -; GCN-NEXT: s_addc_u32 s9, s22, s20 -; GCN-NEXT: s_addc_u32 s16, s16, 0 -; GCN-NEXT: s_mul_i32 s19, s17, s19 -; GCN-NEXT: s_add_u32 s9, s9, s19 -; GCN-NEXT: s_addc_u32 s16, 0, s16 -; GCN-NEXT: s_add_u32 s9, s18, s9 -; GCN-NEXT: s_addc_u32 s20, s17, s16 +; GCN-NEXT: s_add_u32 s9, s9, s20 +; GCN-NEXT: s_mul_hi_u32 s17, s19, s16 +; GCN-NEXT: s_addc_u32 s9, s22, s18 +; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_mul_i32 s16, s19, s16 +; GCN-NEXT: s_add_u32 s9, s9, s16 +; GCN-NEXT: s_addc_u32 s18, 0, s17 +; GCN-NEXT: s_add_u32 s9, s21, s9 +; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GCN-NEXT: s_addc_u32 s20, s19, s18 ; GCN-NEXT: s_ashr_i32 s16, s11, 31 ; GCN-NEXT: s_add_u32 s18, s10, s16 ; GCN-NEXT: s_mov_b32 s17, s16 @@ -5292,9 +5344,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s9, s14, s9 ; GCN-NEXT: s_sub_u32 s9, s18, s9 ; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s18, s22, s15 ; GCN-NEXT: s_sub_u32 s24, s9, s14 ; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 ; GCN-NEXT: s_subb_u32 s25, s18, 0 ; GCN-NEXT: s_cmp_ge_u32 s25, s15 ; GCN-NEXT: s_cselect_b32 s26, -1, 0 @@ -5304,10 +5358,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s26, s27, s26 ; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 ; GCN-NEXT: s_subb_u32 s18, s18, s15 -; GCN-NEXT: s_sub_u32 s22, s24, s14 +; GCN-NEXT: s_sub_u32 s27, s24, s14 +; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 ; GCN-NEXT: s_subb_u32 s18, s18, 0 ; GCN-NEXT: s_cmp_lg_u32 s26, 0 -; GCN-NEXT: s_cselect_b32 s22, s22, s24 +; GCN-NEXT: s_cselect_b32 s22, s27, s24 ; GCN-NEXT: s_cselect_b32 s18, s18, s25 ; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s11, s19, s11 @@ -5364,7 +5420,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GCN-NEXT: s_sub_u32 s3, 0, s10 -; GCN-NEXT: s_subb_u32 s12, 0, s11 +; GCN-NEXT: s_subb_u32 s14, 0, s11 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -5373,52 +5429,56 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s13, v1 -; GCN-NEXT: v_readfirstlane_b32 s14, v0 -; GCN-NEXT: s_mul_i32 s15, s3, s13 -; GCN-NEXT: s_mul_hi_u32 s17, s3, s14 -; GCN-NEXT: s_mul_i32 s16, s12, s14 -; GCN-NEXT: s_add_i32 s15, s17, s15 -; GCN-NEXT: s_add_i32 s15, s15, s16 -; GCN-NEXT: s_mul_i32 s18, s3, s14 -; GCN-NEXT: s_mul_i32 s17, s14, s15 -; GCN-NEXT: s_mul_hi_u32 s19, s14, s18 -; GCN-NEXT: s_mul_hi_u32 s16, s14, s15 +; GCN-NEXT: v_readfirstlane_b32 s15, v1 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_mul_i32 s13, s3, s15 +; GCN-NEXT: s_mul_hi_u32 s17, s3, s12 +; GCN-NEXT: s_mul_i32 s16, s14, s12 +; GCN-NEXT: s_add_i32 s13, s17, s13 +; GCN-NEXT: s_add_i32 s13, s13, s16 +; GCN-NEXT: s_mul_i32 s18, s3, s12 +; GCN-NEXT: s_mul_i32 s17, s12, s13 +; GCN-NEXT: s_mul_hi_u32 s19, s12, s18 +; GCN-NEXT: s_mul_hi_u32 s16, s12, s13 ; GCN-NEXT: s_add_u32 s17, s19, s17 ; GCN-NEXT: s_addc_u32 s16, 0, s16 -; GCN-NEXT: s_mul_hi_u32 s20, s13, s18 -; GCN-NEXT: s_mul_i32 s18, s13, s18 +; GCN-NEXT: s_mul_hi_u32 s20, s15, s18 +; GCN-NEXT: s_mul_i32 s18, s15, s18 ; GCN-NEXT: s_add_u32 s17, s17, s18 -; GCN-NEXT: s_mul_hi_u32 s19, s13, s15 +; GCN-NEXT: s_mul_hi_u32 s19, s15, s13 ; GCN-NEXT: s_addc_u32 s16, s16, s20 ; GCN-NEXT: s_addc_u32 s17, s19, 0 -; GCN-NEXT: s_mul_i32 s15, s13, s15 -; GCN-NEXT: s_add_u32 s15, s16, s15 +; GCN-NEXT: s_mul_i32 s13, s15, s13 +; GCN-NEXT: s_add_u32 s13, s16, s13 ; GCN-NEXT: s_addc_u32 s16, 0, s17 -; GCN-NEXT: s_add_u32 s14, s14, s15 -; GCN-NEXT: s_addc_u32 s13, s13, s16 -; GCN-NEXT: s_mul_i32 s15, s3, s13 -; GCN-NEXT: s_mul_hi_u32 s16, s3, s14 -; GCN-NEXT: s_add_i32 s15, s16, s15 -; GCN-NEXT: s_mul_i32 s12, s12, s14 -; GCN-NEXT: s_add_i32 s15, s15, s12 -; GCN-NEXT: s_mul_i32 s3, s3, s14 -; GCN-NEXT: s_mul_hi_u32 s16, s13, s3 -; GCN-NEXT: s_mul_i32 s17, s13, s3 -; GCN-NEXT: s_mul_i32 s19, s14, s15 -; GCN-NEXT: s_mul_hi_u32 s3, s14, s3 -; GCN-NEXT: s_mul_hi_u32 s18, s14, s15 +; GCN-NEXT: s_add_u32 s17, s12, s13 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GCN-NEXT: s_addc_u32 s15, s15, s16 +; GCN-NEXT: s_mul_i32 s12, s3, s15 +; GCN-NEXT: s_mul_hi_u32 s13, s3, s17 +; GCN-NEXT: s_add_i32 s12, s13, s12 +; GCN-NEXT: s_mul_i32 s14, s14, s17 +; GCN-NEXT: s_add_i32 s12, s12, s14 +; GCN-NEXT: s_mul_i32 s3, s3, s17 +; GCN-NEXT: s_mul_hi_u32 s14, s15, s3 +; GCN-NEXT: s_mul_i32 s16, s15, s3 +; GCN-NEXT: s_mul_i32 s19, s17, s12 +; GCN-NEXT: s_mul_hi_u32 s3, s17, s3 +; GCN-NEXT: s_mul_hi_u32 s18, s17, s12 ; GCN-NEXT: s_add_u32 s3, s3, s19 ; GCN-NEXT: s_addc_u32 s18, 0, s18 -; GCN-NEXT: s_add_u32 s3, s3, s17 -; GCN-NEXT: s_mul_hi_u32 s12, s13, s15 -; GCN-NEXT: s_addc_u32 s3, s18, s16 -; GCN-NEXT: s_addc_u32 s12, s12, 0 -; GCN-NEXT: s_mul_i32 s15, s13, s15 -; GCN-NEXT: s_add_u32 s3, s3, s15 -; GCN-NEXT: s_addc_u32 s12, 0, s12 -; GCN-NEXT: s_add_u32 s3, s14, s3 -; GCN-NEXT: s_addc_u32 s16, s13, s12 +; GCN-NEXT: s_add_u32 s3, s3, s16 +; GCN-NEXT: s_mul_hi_u32 s13, s15, s12 +; GCN-NEXT: s_addc_u32 s3, s18, s14 +; GCN-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NEXT: s_mul_i32 s12, s15, s12 +; GCN-NEXT: s_add_u32 s3, s3, s12 +; GCN-NEXT: s_addc_u32 s14, 0, s13 +; GCN-NEXT: s_add_u32 s3, s17, s3 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GCN-NEXT: s_addc_u32 s16, s15, s14 ; GCN-NEXT: s_ashr_i32 s12, s5, 31 ; GCN-NEXT: s_add_u32 s14, s4, s12 ; GCN-NEXT: s_mov_b32 s13, s12 @@ -5447,9 +5507,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s3, s10, s3 ; GCN-NEXT: s_sub_u32 s3, s14, s3 ; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s14, s18, s11 ; GCN-NEXT: s_sub_u32 s20, s3, s10 ; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s21, s14, 0 ; GCN-NEXT: s_cmp_ge_u32 s21, s11 ; GCN-NEXT: s_cselect_b32 s22, -1, 0 @@ -5459,10 +5521,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s22, s23, s22 ; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s14, s14, s11 -; GCN-NEXT: s_sub_u32 s18, s20, s10 +; GCN-NEXT: s_sub_u32 s23, s20, s10 +; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s14, s14, 0 ; GCN-NEXT: s_cmp_lg_u32 s22, 0 -; GCN-NEXT: s_cselect_b32 s18, s18, s20 +; GCN-NEXT: s_cselect_b32 s18, s23, s20 ; GCN-NEXT: s_cselect_b32 s14, s14, s21 ; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s5, s15, s5 @@ -6235,9 +6299,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_readfirstlane_b32 s14, v8 ; TONGA-NEXT: s_sub_u32 s12, s12, s14 ; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, s7 ; TONGA-NEXT: s_sub_u32 s18, s12, s6 ; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s19, s1, 0 ; TONGA-NEXT: s_cmp_ge_u32 s19, s7 ; TONGA-NEXT: s_cselect_b32 s20, -1, 0 @@ -6247,10 +6313,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_cselect_b32 s20, s21, s20 ; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, s7 -; TONGA-NEXT: s_sub_u32 s16, s18, s6 +; TONGA-NEXT: s_sub_u32 s21, s18, s6 +; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, 0 ; TONGA-NEXT: s_cmp_lg_u32 s20, 0 -; TONGA-NEXT: s_cselect_b32 s16, s16, s18 +; TONGA-NEXT: s_cselect_b32 s16, s21, s18 ; TONGA-NEXT: s_cselect_b32 s1, s1, s19 ; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s3, s13, s3 |