aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU/srem.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/srem.ll')
-rw-r--r--llvm/test/CodeGen/AMDGPU/srem.ll654
1 files changed, 361 insertions, 293 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll
index e12e31b..bbd1793 100644
--- a/llvm/test/CodeGen/AMDGPU/srem.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem.ll
@@ -1513,7 +1513,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9
; GCN-NEXT: s_sub_u32 s3, 0, s8
-; GCN-NEXT: s_subb_u32 s10, 0, s9
+; GCN-NEXT: s_subb_u32 s12, 0, s9
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1522,52 +1522,56 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s11, v1
-; GCN-NEXT: v_readfirstlane_b32 s12, v0
-; GCN-NEXT: s_mul_i32 s13, s3, s11
-; GCN-NEXT: s_mul_hi_u32 s15, s3, s12
-; GCN-NEXT: s_mul_i32 s14, s10, s12
-; GCN-NEXT: s_add_i32 s13, s15, s13
-; GCN-NEXT: s_add_i32 s13, s13, s14
-; GCN-NEXT: s_mul_i32 s16, s3, s12
-; GCN-NEXT: s_mul_i32 s15, s12, s13
-; GCN-NEXT: s_mul_hi_u32 s17, s12, s16
-; GCN-NEXT: s_mul_hi_u32 s14, s12, s13
+; GCN-NEXT: v_readfirstlane_b32 s13, v1
+; GCN-NEXT: v_readfirstlane_b32 s10, v0
+; GCN-NEXT: s_mul_i32 s11, s3, s13
+; GCN-NEXT: s_mul_hi_u32 s15, s3, s10
+; GCN-NEXT: s_mul_i32 s14, s12, s10
+; GCN-NEXT: s_add_i32 s11, s15, s11
+; GCN-NEXT: s_add_i32 s11, s11, s14
+; GCN-NEXT: s_mul_i32 s16, s3, s10
+; GCN-NEXT: s_mul_i32 s15, s10, s11
+; GCN-NEXT: s_mul_hi_u32 s17, s10, s16
+; GCN-NEXT: s_mul_hi_u32 s14, s10, s11
; GCN-NEXT: s_add_u32 s15, s17, s15
; GCN-NEXT: s_addc_u32 s14, 0, s14
-; GCN-NEXT: s_mul_hi_u32 s18, s11, s16
-; GCN-NEXT: s_mul_i32 s16, s11, s16
+; GCN-NEXT: s_mul_hi_u32 s18, s13, s16
+; GCN-NEXT: s_mul_i32 s16, s13, s16
; GCN-NEXT: s_add_u32 s15, s15, s16
-; GCN-NEXT: s_mul_hi_u32 s17, s11, s13
+; GCN-NEXT: s_mul_hi_u32 s17, s13, s11
; GCN-NEXT: s_addc_u32 s14, s14, s18
; GCN-NEXT: s_addc_u32 s15, s17, 0
-; GCN-NEXT: s_mul_i32 s13, s11, s13
-; GCN-NEXT: s_add_u32 s13, s14, s13
+; GCN-NEXT: s_mul_i32 s11, s13, s11
+; GCN-NEXT: s_add_u32 s11, s14, s11
; GCN-NEXT: s_addc_u32 s14, 0, s15
-; GCN-NEXT: s_add_u32 s12, s12, s13
-; GCN-NEXT: s_addc_u32 s11, s11, s14
-; GCN-NEXT: s_mul_i32 s13, s3, s11
-; GCN-NEXT: s_mul_hi_u32 s14, s3, s12
-; GCN-NEXT: s_add_i32 s13, s14, s13
-; GCN-NEXT: s_mul_i32 s10, s10, s12
-; GCN-NEXT: s_add_i32 s13, s13, s10
-; GCN-NEXT: s_mul_i32 s3, s3, s12
-; GCN-NEXT: s_mul_hi_u32 s14, s11, s3
-; GCN-NEXT: s_mul_i32 s15, s11, s3
-; GCN-NEXT: s_mul_i32 s17, s12, s13
-; GCN-NEXT: s_mul_hi_u32 s3, s12, s3
-; GCN-NEXT: s_mul_hi_u32 s16, s12, s13
+; GCN-NEXT: s_add_u32 s15, s10, s11
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0
+; GCN-NEXT: s_addc_u32 s13, s13, s14
+; GCN-NEXT: s_mul_i32 s10, s3, s13
+; GCN-NEXT: s_mul_hi_u32 s11, s3, s15
+; GCN-NEXT: s_add_i32 s10, s11, s10
+; GCN-NEXT: s_mul_i32 s12, s12, s15
+; GCN-NEXT: s_add_i32 s10, s10, s12
+; GCN-NEXT: s_mul_i32 s3, s3, s15
+; GCN-NEXT: s_mul_hi_u32 s12, s13, s3
+; GCN-NEXT: s_mul_i32 s14, s13, s3
+; GCN-NEXT: s_mul_i32 s17, s15, s10
+; GCN-NEXT: s_mul_hi_u32 s3, s15, s3
+; GCN-NEXT: s_mul_hi_u32 s16, s15, s10
; GCN-NEXT: s_add_u32 s3, s3, s17
; GCN-NEXT: s_addc_u32 s16, 0, s16
-; GCN-NEXT: s_add_u32 s3, s3, s15
-; GCN-NEXT: s_mul_hi_u32 s10, s11, s13
-; GCN-NEXT: s_addc_u32 s3, s16, s14
-; GCN-NEXT: s_addc_u32 s10, s10, 0
-; GCN-NEXT: s_mul_i32 s13, s11, s13
-; GCN-NEXT: s_add_u32 s3, s3, s13
-; GCN-NEXT: s_addc_u32 s10, 0, s10
-; GCN-NEXT: s_add_u32 s3, s12, s3
-; GCN-NEXT: s_addc_u32 s14, s11, s10
+; GCN-NEXT: s_add_u32 s3, s3, s14
+; GCN-NEXT: s_mul_hi_u32 s11, s13, s10
+; GCN-NEXT: s_addc_u32 s3, s16, s12
+; GCN-NEXT: s_addc_u32 s11, s11, 0
+; GCN-NEXT: s_mul_i32 s10, s13, s10
+; GCN-NEXT: s_add_u32 s3, s3, s10
+; GCN-NEXT: s_addc_u32 s12, 0, s11
+; GCN-NEXT: s_add_u32 s3, s15, s3
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0
+; GCN-NEXT: s_addc_u32 s14, s13, s12
; GCN-NEXT: s_ashr_i32 s10, s5, 31
; GCN-NEXT: s_add_u32 s12, s4, s10
; GCN-NEXT: s_mov_b32 s11, s10
@@ -1596,9 +1600,11 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: s_mul_i32 s3, s8, s3
; GCN-NEXT: s_sub_u32 s3, s12, s3
; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
; GCN-NEXT: s_subb_u32 s12, s16, s9
; GCN-NEXT: s_sub_u32 s18, s3, s8
; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s19, s12, 0
; GCN-NEXT: s_cmp_ge_u32 s19, s9
; GCN-NEXT: s_cselect_b32 s20, -1, 0
@@ -1608,10 +1614,12 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: s_cselect_b32 s20, s21, s20
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s12, s12, s9
-; GCN-NEXT: s_sub_u32 s16, s18, s8
+; GCN-NEXT: s_sub_u32 s21, s18, s8
+; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s12, s12, 0
; GCN-NEXT: s_cmp_lg_u32 s20, 0
-; GCN-NEXT: s_cselect_b32 s16, s16, s18
+; GCN-NEXT: s_cselect_b32 s16, s21, s18
; GCN-NEXT: s_cselect_b32 s12, s12, s19
; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
; GCN-NEXT: s_subb_u32 s5, s13, s5
@@ -1923,9 +1931,11 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; TONGA-NEXT: v_readfirstlane_b32 s14, v0
; TONGA-NEXT: s_sub_u32 s12, s12, s14
; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0
+; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s3, s3, s7
; TONGA-NEXT: s_sub_u32 s18, s12, s6
; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
+; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s19, s3, 0
; TONGA-NEXT: s_cmp_ge_u32 s19, s7
; TONGA-NEXT: s_cselect_b32 s20, -1, 0
@@ -1935,10 +1945,12 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; TONGA-NEXT: s_cselect_b32 s20, s21, s20
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s3, s3, s7
-; TONGA-NEXT: s_sub_u32 s16, s18, s6
+; TONGA-NEXT: s_sub_u32 s21, s18, s6
+; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
+; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s3, s3, 0
; TONGA-NEXT: s_cmp_lg_u32 s20, 0
-; TONGA-NEXT: s_cselect_b32 s16, s16, s18
+; TONGA-NEXT: s_cselect_b32 s16, s21, s18
; TONGA-NEXT: s_cselect_b32 s3, s3, s19
; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s5, s13, s5
@@ -2718,7 +2730,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7
; GCN-NEXT: s_sub_u32 s9, 0, s6
-; GCN-NEXT: s_subb_u32 s14, 0, s7
+; GCN-NEXT: s_subb_u32 s16, 0, s7
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2727,52 +2739,56 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s15, v1
-; GCN-NEXT: v_readfirstlane_b32 s16, v0
-; GCN-NEXT: s_mul_i32 s17, s9, s15
-; GCN-NEXT: s_mul_hi_u32 s19, s9, s16
-; GCN-NEXT: s_mul_i32 s18, s14, s16
-; GCN-NEXT: s_add_i32 s17, s19, s17
-; GCN-NEXT: s_add_i32 s17, s17, s18
-; GCN-NEXT: s_mul_i32 s20, s9, s16
-; GCN-NEXT: s_mul_i32 s19, s16, s17
-; GCN-NEXT: s_mul_hi_u32 s21, s16, s20
-; GCN-NEXT: s_mul_hi_u32 s18, s16, s17
+; GCN-NEXT: v_readfirstlane_b32 s17, v1
+; GCN-NEXT: v_readfirstlane_b32 s14, v0
+; GCN-NEXT: s_mul_i32 s15, s9, s17
+; GCN-NEXT: s_mul_hi_u32 s19, s9, s14
+; GCN-NEXT: s_mul_i32 s18, s16, s14
+; GCN-NEXT: s_add_i32 s15, s19, s15
+; GCN-NEXT: s_add_i32 s15, s15, s18
+; GCN-NEXT: s_mul_i32 s20, s9, s14
+; GCN-NEXT: s_mul_i32 s19, s14, s15
+; GCN-NEXT: s_mul_hi_u32 s21, s14, s20
+; GCN-NEXT: s_mul_hi_u32 s18, s14, s15
; GCN-NEXT: s_add_u32 s19, s21, s19
; GCN-NEXT: s_addc_u32 s18, 0, s18
-; GCN-NEXT: s_mul_hi_u32 s22, s15, s20
-; GCN-NEXT: s_mul_i32 s20, s15, s20
+; GCN-NEXT: s_mul_hi_u32 s22, s17, s20
+; GCN-NEXT: s_mul_i32 s20, s17, s20
; GCN-NEXT: s_add_u32 s19, s19, s20
-; GCN-NEXT: s_mul_hi_u32 s21, s15, s17
+; GCN-NEXT: s_mul_hi_u32 s21, s17, s15
; GCN-NEXT: s_addc_u32 s18, s18, s22
; GCN-NEXT: s_addc_u32 s19, s21, 0
-; GCN-NEXT: s_mul_i32 s17, s15, s17
-; GCN-NEXT: s_add_u32 s17, s18, s17
+; GCN-NEXT: s_mul_i32 s15, s17, s15
+; GCN-NEXT: s_add_u32 s15, s18, s15
; GCN-NEXT: s_addc_u32 s18, 0, s19
-; GCN-NEXT: s_add_u32 s16, s16, s17
-; GCN-NEXT: s_addc_u32 s15, s15, s18
-; GCN-NEXT: s_mul_i32 s17, s9, s15
-; GCN-NEXT: s_mul_hi_u32 s18, s9, s16
-; GCN-NEXT: s_add_i32 s17, s18, s17
-; GCN-NEXT: s_mul_i32 s14, s14, s16
-; GCN-NEXT: s_add_i32 s17, s17, s14
-; GCN-NEXT: s_mul_i32 s9, s9, s16
-; GCN-NEXT: s_mul_hi_u32 s18, s15, s9
-; GCN-NEXT: s_mul_i32 s19, s15, s9
-; GCN-NEXT: s_mul_i32 s21, s16, s17
-; GCN-NEXT: s_mul_hi_u32 s9, s16, s9
-; GCN-NEXT: s_mul_hi_u32 s20, s16, s17
+; GCN-NEXT: s_add_u32 s19, s14, s15
+; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
+; GCN-NEXT: s_addc_u32 s17, s17, s18
+; GCN-NEXT: s_mul_i32 s14, s9, s17
+; GCN-NEXT: s_mul_hi_u32 s15, s9, s19
+; GCN-NEXT: s_add_i32 s14, s15, s14
+; GCN-NEXT: s_mul_i32 s16, s16, s19
+; GCN-NEXT: s_add_i32 s14, s14, s16
+; GCN-NEXT: s_mul_i32 s9, s9, s19
+; GCN-NEXT: s_mul_hi_u32 s16, s17, s9
+; GCN-NEXT: s_mul_i32 s18, s17, s9
+; GCN-NEXT: s_mul_i32 s21, s19, s14
+; GCN-NEXT: s_mul_hi_u32 s9, s19, s9
+; GCN-NEXT: s_mul_hi_u32 s20, s19, s14
; GCN-NEXT: s_add_u32 s9, s9, s21
; GCN-NEXT: s_addc_u32 s20, 0, s20
-; GCN-NEXT: s_add_u32 s9, s9, s19
-; GCN-NEXT: s_mul_hi_u32 s14, s15, s17
-; GCN-NEXT: s_addc_u32 s9, s20, s18
-; GCN-NEXT: s_addc_u32 s14, s14, 0
-; GCN-NEXT: s_mul_i32 s17, s15, s17
-; GCN-NEXT: s_add_u32 s9, s9, s17
-; GCN-NEXT: s_addc_u32 s14, 0, s14
-; GCN-NEXT: s_add_u32 s9, s16, s9
-; GCN-NEXT: s_addc_u32 s18, s15, s14
+; GCN-NEXT: s_add_u32 s9, s9, s18
+; GCN-NEXT: s_mul_hi_u32 s15, s17, s14
+; GCN-NEXT: s_addc_u32 s9, s20, s16
+; GCN-NEXT: s_addc_u32 s15, s15, 0
+; GCN-NEXT: s_mul_i32 s14, s17, s14
+; GCN-NEXT: s_add_u32 s9, s9, s14
+; GCN-NEXT: s_addc_u32 s16, 0, s15
+; GCN-NEXT: s_add_u32 s9, s19, s9
+; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
+; GCN-NEXT: s_addc_u32 s18, s17, s16
; GCN-NEXT: s_ashr_i32 s14, s11, 31
; GCN-NEXT: s_add_u32 s16, s10, s14
; GCN-NEXT: s_mov_b32 s15, s14
@@ -2801,9 +2817,11 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s9, s6, s9
; GCN-NEXT: s_sub_u32 s9, s16, s9
; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s16, s20, s7
; GCN-NEXT: s_sub_u32 s22, s9, s6
; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s23, s16, 0
; GCN-NEXT: s_cmp_ge_u32 s23, s7
; GCN-NEXT: s_cselect_b32 s24, -1, 0
@@ -2813,10 +2831,12 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s24, s25, s24
; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s16, s16, s7
-; GCN-NEXT: s_sub_u32 s20, s22, s6
+; GCN-NEXT: s_sub_u32 s25, s22, s6
+; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s16, s16, 0
; GCN-NEXT: s_cmp_lg_u32 s24, 0
-; GCN-NEXT: s_cselect_b32 s20, s20, s22
+; GCN-NEXT: s_cselect_b32 s20, s25, s22
; GCN-NEXT: s_cselect_b32 s16, s16, s23
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s11, s17, s11
@@ -2867,7 +2887,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11
; GCN-NEXT: s_sub_u32 s3, 0, s10
-; GCN-NEXT: s_subb_u32 s12, 0, s11
+; GCN-NEXT: s_subb_u32 s14, 0, s11
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2876,52 +2896,56 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s13, v1
-; GCN-NEXT: v_readfirstlane_b32 s14, v0
-; GCN-NEXT: s_mul_i32 s15, s3, s13
-; GCN-NEXT: s_mul_hi_u32 s17, s3, s14
-; GCN-NEXT: s_mul_i32 s16, s12, s14
-; GCN-NEXT: s_add_i32 s15, s17, s15
-; GCN-NEXT: s_add_i32 s15, s15, s16
-; GCN-NEXT: s_mul_i32 s18, s3, s14
-; GCN-NEXT: s_mul_i32 s17, s14, s15
-; GCN-NEXT: s_mul_hi_u32 s19, s14, s18
-; GCN-NEXT: s_mul_hi_u32 s16, s14, s15
+; GCN-NEXT: v_readfirstlane_b32 s15, v1
+; GCN-NEXT: v_readfirstlane_b32 s12, v0
+; GCN-NEXT: s_mul_i32 s13, s3, s15
+; GCN-NEXT: s_mul_hi_u32 s17, s3, s12
+; GCN-NEXT: s_mul_i32 s16, s14, s12
+; GCN-NEXT: s_add_i32 s13, s17, s13
+; GCN-NEXT: s_add_i32 s13, s13, s16
+; GCN-NEXT: s_mul_i32 s18, s3, s12
+; GCN-NEXT: s_mul_i32 s17, s12, s13
+; GCN-NEXT: s_mul_hi_u32 s19, s12, s18
+; GCN-NEXT: s_mul_hi_u32 s16, s12, s13
; GCN-NEXT: s_add_u32 s17, s19, s17
; GCN-NEXT: s_addc_u32 s16, 0, s16
-; GCN-NEXT: s_mul_hi_u32 s20, s13, s18
-; GCN-NEXT: s_mul_i32 s18, s13, s18
+; GCN-NEXT: s_mul_hi_u32 s20, s15, s18
+; GCN-NEXT: s_mul_i32 s18, s15, s18
; GCN-NEXT: s_add_u32 s17, s17, s18
-; GCN-NEXT: s_mul_hi_u32 s19, s13, s15
+; GCN-NEXT: s_mul_hi_u32 s19, s15, s13
; GCN-NEXT: s_addc_u32 s16, s16, s20
; GCN-NEXT: s_addc_u32 s17, s19, 0
-; GCN-NEXT: s_mul_i32 s15, s13, s15
-; GCN-NEXT: s_add_u32 s15, s16, s15
+; GCN-NEXT: s_mul_i32 s13, s15, s13
+; GCN-NEXT: s_add_u32 s13, s16, s13
; GCN-NEXT: s_addc_u32 s16, 0, s17
-; GCN-NEXT: s_add_u32 s14, s14, s15
-; GCN-NEXT: s_addc_u32 s13, s13, s16
-; GCN-NEXT: s_mul_i32 s15, s3, s13
-; GCN-NEXT: s_mul_hi_u32 s16, s3, s14
-; GCN-NEXT: s_add_i32 s15, s16, s15
-; GCN-NEXT: s_mul_i32 s12, s12, s14
-; GCN-NEXT: s_add_i32 s15, s15, s12
-; GCN-NEXT: s_mul_i32 s3, s3, s14
-; GCN-NEXT: s_mul_hi_u32 s16, s13, s3
-; GCN-NEXT: s_mul_i32 s17, s13, s3
-; GCN-NEXT: s_mul_i32 s19, s14, s15
-; GCN-NEXT: s_mul_hi_u32 s3, s14, s3
-; GCN-NEXT: s_mul_hi_u32 s18, s14, s15
+; GCN-NEXT: s_add_u32 s17, s12, s13
+; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
+; GCN-NEXT: s_addc_u32 s15, s15, s16
+; GCN-NEXT: s_mul_i32 s12, s3, s15
+; GCN-NEXT: s_mul_hi_u32 s13, s3, s17
+; GCN-NEXT: s_add_i32 s12, s13, s12
+; GCN-NEXT: s_mul_i32 s14, s14, s17
+; GCN-NEXT: s_add_i32 s12, s12, s14
+; GCN-NEXT: s_mul_i32 s3, s3, s17
+; GCN-NEXT: s_mul_hi_u32 s14, s15, s3
+; GCN-NEXT: s_mul_i32 s16, s15, s3
+; GCN-NEXT: s_mul_i32 s19, s17, s12
+; GCN-NEXT: s_mul_hi_u32 s3, s17, s3
+; GCN-NEXT: s_mul_hi_u32 s18, s17, s12
; GCN-NEXT: s_add_u32 s3, s3, s19
; GCN-NEXT: s_addc_u32 s18, 0, s18
-; GCN-NEXT: s_add_u32 s3, s3, s17
-; GCN-NEXT: s_mul_hi_u32 s12, s13, s15
-; GCN-NEXT: s_addc_u32 s3, s18, s16
-; GCN-NEXT: s_addc_u32 s12, s12, 0
-; GCN-NEXT: s_mul_i32 s15, s13, s15
-; GCN-NEXT: s_add_u32 s3, s3, s15
-; GCN-NEXT: s_addc_u32 s12, 0, s12
-; GCN-NEXT: s_add_u32 s3, s14, s3
-; GCN-NEXT: s_addc_u32 s16, s13, s12
+; GCN-NEXT: s_add_u32 s3, s3, s16
+; GCN-NEXT: s_mul_hi_u32 s13, s15, s12
+; GCN-NEXT: s_addc_u32 s3, s18, s14
+; GCN-NEXT: s_addc_u32 s13, s13, 0
+; GCN-NEXT: s_mul_i32 s12, s15, s12
+; GCN-NEXT: s_add_u32 s3, s3, s12
+; GCN-NEXT: s_addc_u32 s14, 0, s13
+; GCN-NEXT: s_add_u32 s3, s17, s3
+; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
+; GCN-NEXT: s_addc_u32 s16, s15, s14
; GCN-NEXT: s_ashr_i32 s12, s5, 31
; GCN-NEXT: s_add_u32 s14, s4, s12
; GCN-NEXT: s_mov_b32 s13, s12
@@ -2950,9 +2974,11 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s3, s10, s3
; GCN-NEXT: s_sub_u32 s3, s14, s3
; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s14, s18, s11
; GCN-NEXT: s_sub_u32 s20, s3, s10
; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s21, s14, 0
; GCN-NEXT: s_cmp_ge_u32 s21, s11
; GCN-NEXT: s_cselect_b32 s22, -1, 0
@@ -2962,10 +2988,12 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s22, s23, s22
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s14, s14, s11
-; GCN-NEXT: s_sub_u32 s18, s20, s10
+; GCN-NEXT: s_sub_u32 s23, s20, s10
+; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s14, s14, 0
; GCN-NEXT: s_cmp_lg_u32 s22, 0
-; GCN-NEXT: s_cselect_b32 s18, s18, s20
+; GCN-NEXT: s_cselect_b32 s18, s23, s20
; GCN-NEXT: s_cselect_b32 s14, s14, s21
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s5, s15, s5
@@ -3435,9 +3463,11 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_readfirstlane_b32 s14, v0
; TONGA-NEXT: s_sub_u32 s12, s12, s14
; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0
+; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s1, s1, s7
; TONGA-NEXT: s_sub_u32 s18, s12, s6
; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
+; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s19, s1, 0
; TONGA-NEXT: s_cmp_ge_u32 s19, s7
; TONGA-NEXT: s_cselect_b32 s20, -1, 0
@@ -3447,10 +3477,12 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: s_cselect_b32 s20, s21, s20
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s1, s1, s7
-; TONGA-NEXT: s_sub_u32 s16, s18, s6
+; TONGA-NEXT: s_sub_u32 s21, s18, s6
+; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
+; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s1, s1, 0
; TONGA-NEXT: s_cmp_lg_u32 s20, 0
-; TONGA-NEXT: s_cselect_b32 s16, s16, s18
+; TONGA-NEXT: s_cselect_b32 s16, s21, s18
; TONGA-NEXT: s_cselect_b32 s1, s1, s19
; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s3, s13, s3
@@ -4902,7 +4934,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7
; GCN-NEXT: s_sub_u32 s17, 0, s6
-; GCN-NEXT: s_subb_u32 s22, 0, s7
+; GCN-NEXT: s_subb_u32 s24, 0, s7
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -4911,52 +4943,56 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s23, v1
-; GCN-NEXT: v_readfirstlane_b32 s24, v0
-; GCN-NEXT: s_mul_i32 s25, s17, s23
-; GCN-NEXT: s_mul_hi_u32 s27, s17, s24
-; GCN-NEXT: s_mul_i32 s26, s22, s24
-; GCN-NEXT: s_add_i32 s25, s27, s25
-; GCN-NEXT: s_add_i32 s25, s25, s26
-; GCN-NEXT: s_mul_i32 s28, s17, s24
-; GCN-NEXT: s_mul_i32 s27, s24, s25
-; GCN-NEXT: s_mul_hi_u32 s29, s24, s28
-; GCN-NEXT: s_mul_hi_u32 s26, s24, s25
+; GCN-NEXT: v_readfirstlane_b32 s25, v1
+; GCN-NEXT: v_readfirstlane_b32 s22, v0
+; GCN-NEXT: s_mul_i32 s23, s17, s25
+; GCN-NEXT: s_mul_hi_u32 s27, s17, s22
+; GCN-NEXT: s_mul_i32 s26, s24, s22
+; GCN-NEXT: s_add_i32 s23, s27, s23
+; GCN-NEXT: s_add_i32 s23, s23, s26
+; GCN-NEXT: s_mul_i32 s28, s17, s22
+; GCN-NEXT: s_mul_i32 s27, s22, s23
+; GCN-NEXT: s_mul_hi_u32 s29, s22, s28
+; GCN-NEXT: s_mul_hi_u32 s26, s22, s23
; GCN-NEXT: s_add_u32 s27, s29, s27
; GCN-NEXT: s_addc_u32 s26, 0, s26
-; GCN-NEXT: s_mul_hi_u32 s30, s23, s28
-; GCN-NEXT: s_mul_i32 s28, s23, s28
+; GCN-NEXT: s_mul_hi_u32 s30, s25, s28
+; GCN-NEXT: s_mul_i32 s28, s25, s28
; GCN-NEXT: s_add_u32 s27, s27, s28
-; GCN-NEXT: s_mul_hi_u32 s29, s23, s25
+; GCN-NEXT: s_mul_hi_u32 s29, s25, s23
; GCN-NEXT: s_addc_u32 s26, s26, s30
; GCN-NEXT: s_addc_u32 s27, s29, 0
-; GCN-NEXT: s_mul_i32 s25, s23, s25
-; GCN-NEXT: s_add_u32 s25, s26, s25
+; GCN-NEXT: s_mul_i32 s23, s25, s23
+; GCN-NEXT: s_add_u32 s23, s26, s23
; GCN-NEXT: s_addc_u32 s26, 0, s27
-; GCN-NEXT: s_add_u32 s24, s24, s25
-; GCN-NEXT: s_addc_u32 s23, s23, s26
-; GCN-NEXT: s_mul_i32 s25, s17, s23
-; GCN-NEXT: s_mul_hi_u32 s26, s17, s24
-; GCN-NEXT: s_add_i32 s25, s26, s25
-; GCN-NEXT: s_mul_i32 s22, s22, s24
-; GCN-NEXT: s_add_i32 s25, s25, s22
-; GCN-NEXT: s_mul_i32 s17, s17, s24
-; GCN-NEXT: s_mul_hi_u32 s26, s23, s17
-; GCN-NEXT: s_mul_i32 s27, s23, s17
-; GCN-NEXT: s_mul_i32 s29, s24, s25
-; GCN-NEXT: s_mul_hi_u32 s17, s24, s17
-; GCN-NEXT: s_mul_hi_u32 s28, s24, s25
+; GCN-NEXT: s_add_u32 s27, s22, s23
+; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
+; GCN-NEXT: s_addc_u32 s25, s25, s26
+; GCN-NEXT: s_mul_i32 s22, s17, s25
+; GCN-NEXT: s_mul_hi_u32 s23, s17, s27
+; GCN-NEXT: s_add_i32 s22, s23, s22
+; GCN-NEXT: s_mul_i32 s24, s24, s27
+; GCN-NEXT: s_add_i32 s22, s22, s24
+; GCN-NEXT: s_mul_i32 s17, s17, s27
+; GCN-NEXT: s_mul_hi_u32 s24, s25, s17
+; GCN-NEXT: s_mul_i32 s26, s25, s17
+; GCN-NEXT: s_mul_i32 s29, s27, s22
+; GCN-NEXT: s_mul_hi_u32 s17, s27, s17
+; GCN-NEXT: s_mul_hi_u32 s28, s27, s22
; GCN-NEXT: s_add_u32 s17, s17, s29
; GCN-NEXT: s_addc_u32 s28, 0, s28
-; GCN-NEXT: s_add_u32 s17, s17, s27
-; GCN-NEXT: s_mul_hi_u32 s22, s23, s25
-; GCN-NEXT: s_addc_u32 s17, s28, s26
-; GCN-NEXT: s_addc_u32 s22, s22, 0
-; GCN-NEXT: s_mul_i32 s25, s23, s25
-; GCN-NEXT: s_add_u32 s17, s17, s25
-; GCN-NEXT: s_addc_u32 s22, 0, s22
-; GCN-NEXT: s_add_u32 s17, s24, s17
-; GCN-NEXT: s_addc_u32 s26, s23, s22
+; GCN-NEXT: s_add_u32 s17, s17, s26
+; GCN-NEXT: s_mul_hi_u32 s23, s25, s22
+; GCN-NEXT: s_addc_u32 s17, s28, s24
+; GCN-NEXT: s_addc_u32 s23, s23, 0
+; GCN-NEXT: s_mul_i32 s22, s25, s22
+; GCN-NEXT: s_add_u32 s17, s17, s22
+; GCN-NEXT: s_addc_u32 s24, 0, s23
+; GCN-NEXT: s_add_u32 s17, s27, s17
+; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
+; GCN-NEXT: s_addc_u32 s26, s25, s24
; GCN-NEXT: s_ashr_i32 s22, s19, 31
; GCN-NEXT: s_add_u32 s24, s18, s22
; GCN-NEXT: s_mov_b32 s23, s22
@@ -4985,9 +5021,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s17, s6, s17
; GCN-NEXT: s_sub_u32 s17, s24, s17
; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s24, s28, s7
; GCN-NEXT: s_sub_u32 s30, s17, s6
; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0
; GCN-NEXT: s_subb_u32 s31, s24, 0
; GCN-NEXT: s_cmp_ge_u32 s31, s7
; GCN-NEXT: s_cselect_b32 s33, -1, 0
@@ -4997,10 +5035,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s33, s34, s33
; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0
; GCN-NEXT: s_subb_u32 s24, s24, s7
-; GCN-NEXT: s_sub_u32 s28, s30, s6
+; GCN-NEXT: s_sub_u32 s34, s30, s6
+; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0
; GCN-NEXT: s_subb_u32 s24, s24, 0
; GCN-NEXT: s_cmp_lg_u32 s33, 0
-; GCN-NEXT: s_cselect_b32 s28, s28, s30
+; GCN-NEXT: s_cselect_b32 s28, s34, s30
; GCN-NEXT: s_cselect_b32 s24, s24, s31
; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s19, s25, s19
@@ -5051,7 +5091,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s18
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s19
; GCN-NEXT: s_sub_u32 s13, 0, s18
-; GCN-NEXT: s_subb_u32 s20, 0, s19
+; GCN-NEXT: s_subb_u32 s22, 0, s19
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -5060,52 +5100,56 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s21, v1
-; GCN-NEXT: v_readfirstlane_b32 s22, v0
-; GCN-NEXT: s_mul_i32 s23, s13, s21
-; GCN-NEXT: s_mul_hi_u32 s25, s13, s22
-; GCN-NEXT: s_mul_i32 s24, s20, s22
-; GCN-NEXT: s_add_i32 s23, s25, s23
-; GCN-NEXT: s_add_i32 s23, s23, s24
-; GCN-NEXT: s_mul_i32 s26, s13, s22
-; GCN-NEXT: s_mul_i32 s25, s22, s23
-; GCN-NEXT: s_mul_hi_u32 s27, s22, s26
-; GCN-NEXT: s_mul_hi_u32 s24, s22, s23
+; GCN-NEXT: v_readfirstlane_b32 s23, v1
+; GCN-NEXT: v_readfirstlane_b32 s20, v0
+; GCN-NEXT: s_mul_i32 s21, s13, s23
+; GCN-NEXT: s_mul_hi_u32 s25, s13, s20
+; GCN-NEXT: s_mul_i32 s24, s22, s20
+; GCN-NEXT: s_add_i32 s21, s25, s21
+; GCN-NEXT: s_add_i32 s21, s21, s24
+; GCN-NEXT: s_mul_i32 s26, s13, s20
+; GCN-NEXT: s_mul_i32 s25, s20, s21
+; GCN-NEXT: s_mul_hi_u32 s27, s20, s26
+; GCN-NEXT: s_mul_hi_u32 s24, s20, s21
; GCN-NEXT: s_add_u32 s25, s27, s25
; GCN-NEXT: s_addc_u32 s24, 0, s24
-; GCN-NEXT: s_mul_hi_u32 s28, s21, s26
-; GCN-NEXT: s_mul_i32 s26, s21, s26
+; GCN-NEXT: s_mul_hi_u32 s28, s23, s26
+; GCN-NEXT: s_mul_i32 s26, s23, s26
; GCN-NEXT: s_add_u32 s25, s25, s26
-; GCN-NEXT: s_mul_hi_u32 s27, s21, s23
+; GCN-NEXT: s_mul_hi_u32 s27, s23, s21
; GCN-NEXT: s_addc_u32 s24, s24, s28
; GCN-NEXT: s_addc_u32 s25, s27, 0
-; GCN-NEXT: s_mul_i32 s23, s21, s23
-; GCN-NEXT: s_add_u32 s23, s24, s23
+; GCN-NEXT: s_mul_i32 s21, s23, s21
+; GCN-NEXT: s_add_u32 s21, s24, s21
; GCN-NEXT: s_addc_u32 s24, 0, s25
-; GCN-NEXT: s_add_u32 s22, s22, s23
-; GCN-NEXT: s_addc_u32 s21, s21, s24
-; GCN-NEXT: s_mul_i32 s23, s13, s21
-; GCN-NEXT: s_mul_hi_u32 s24, s13, s22
-; GCN-NEXT: s_add_i32 s23, s24, s23
-; GCN-NEXT: s_mul_i32 s20, s20, s22
-; GCN-NEXT: s_add_i32 s23, s23, s20
-; GCN-NEXT: s_mul_i32 s13, s13, s22
-; GCN-NEXT: s_mul_hi_u32 s24, s21, s13
-; GCN-NEXT: s_mul_i32 s25, s21, s13
-; GCN-NEXT: s_mul_i32 s27, s22, s23
-; GCN-NEXT: s_mul_hi_u32 s13, s22, s13
-; GCN-NEXT: s_mul_hi_u32 s26, s22, s23
+; GCN-NEXT: s_add_u32 s25, s20, s21
+; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
+; GCN-NEXT: s_addc_u32 s23, s23, s24
+; GCN-NEXT: s_mul_i32 s20, s13, s23
+; GCN-NEXT: s_mul_hi_u32 s21, s13, s25
+; GCN-NEXT: s_add_i32 s20, s21, s20
+; GCN-NEXT: s_mul_i32 s22, s22, s25
+; GCN-NEXT: s_add_i32 s20, s20, s22
+; GCN-NEXT: s_mul_i32 s13, s13, s25
+; GCN-NEXT: s_mul_hi_u32 s22, s23, s13
+; GCN-NEXT: s_mul_i32 s24, s23, s13
+; GCN-NEXT: s_mul_i32 s27, s25, s20
+; GCN-NEXT: s_mul_hi_u32 s13, s25, s13
+; GCN-NEXT: s_mul_hi_u32 s26, s25, s20
; GCN-NEXT: s_add_u32 s13, s13, s27
; GCN-NEXT: s_addc_u32 s26, 0, s26
-; GCN-NEXT: s_add_u32 s13, s13, s25
-; GCN-NEXT: s_mul_hi_u32 s20, s21, s23
-; GCN-NEXT: s_addc_u32 s13, s26, s24
-; GCN-NEXT: s_addc_u32 s20, s20, 0
-; GCN-NEXT: s_mul_i32 s23, s21, s23
-; GCN-NEXT: s_add_u32 s13, s13, s23
-; GCN-NEXT: s_addc_u32 s20, 0, s20
-; GCN-NEXT: s_add_u32 s13, s22, s13
-; GCN-NEXT: s_addc_u32 s24, s21, s20
+; GCN-NEXT: s_add_u32 s13, s13, s24
+; GCN-NEXT: s_mul_hi_u32 s21, s23, s20
+; GCN-NEXT: s_addc_u32 s13, s26, s22
+; GCN-NEXT: s_addc_u32 s21, s21, 0
+; GCN-NEXT: s_mul_i32 s20, s23, s20
+; GCN-NEXT: s_add_u32 s13, s13, s20
+; GCN-NEXT: s_addc_u32 s22, 0, s21
+; GCN-NEXT: s_add_u32 s13, s25, s13
+; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
+; GCN-NEXT: s_addc_u32 s24, s23, s22
; GCN-NEXT: s_ashr_i32 s20, s15, 31
; GCN-NEXT: s_add_u32 s22, s14, s20
; GCN-NEXT: s_mov_b32 s21, s20
@@ -5134,9 +5178,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s13, s18, s13
; GCN-NEXT: s_sub_u32 s13, s22, s13
; GCN-NEXT: s_cselect_b64 s[24:25], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0
; GCN-NEXT: s_subb_u32 s22, s26, s19
; GCN-NEXT: s_sub_u32 s28, s13, s18
; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s29, s22, 0
; GCN-NEXT: s_cmp_ge_u32 s29, s19
; GCN-NEXT: s_cselect_b32 s30, -1, 0
@@ -5146,10 +5192,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s30, s31, s30
; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s22, s22, s19
-; GCN-NEXT: s_sub_u32 s26, s28, s18
+; GCN-NEXT: s_sub_u32 s31, s28, s18
+; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s22, s22, 0
; GCN-NEXT: s_cmp_lg_u32 s30, 0
-; GCN-NEXT: s_cselect_b32 s26, s26, s28
+; GCN-NEXT: s_cselect_b32 s26, s31, s28
; GCN-NEXT: s_cselect_b32 s22, s22, s29
; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0
; GCN-NEXT: s_subb_u32 s15, s23, s15
@@ -5209,7 +5257,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s14
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s15
; GCN-NEXT: s_sub_u32 s9, 0, s14
-; GCN-NEXT: s_subb_u32 s16, 0, s15
+; GCN-NEXT: s_subb_u32 s18, 0, s15
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -5218,52 +5266,56 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s17, v1
-; GCN-NEXT: v_readfirstlane_b32 s18, v0
-; GCN-NEXT: s_mul_i32 s19, s9, s17
-; GCN-NEXT: s_mul_hi_u32 s21, s9, s18
-; GCN-NEXT: s_mul_i32 s20, s16, s18
-; GCN-NEXT: s_add_i32 s19, s21, s19
-; GCN-NEXT: s_add_i32 s19, s19, s20
-; GCN-NEXT: s_mul_i32 s22, s9, s18
-; GCN-NEXT: s_mul_i32 s21, s18, s19
-; GCN-NEXT: s_mul_hi_u32 s23, s18, s22
-; GCN-NEXT: s_mul_hi_u32 s20, s18, s19
+; GCN-NEXT: v_readfirstlane_b32 s19, v1
+; GCN-NEXT: v_readfirstlane_b32 s16, v0
+; GCN-NEXT: s_mul_i32 s17, s9, s19
+; GCN-NEXT: s_mul_hi_u32 s21, s9, s16
+; GCN-NEXT: s_mul_i32 s20, s18, s16
+; GCN-NEXT: s_add_i32 s17, s21, s17
+; GCN-NEXT: s_add_i32 s17, s17, s20
+; GCN-NEXT: s_mul_i32 s22, s9, s16
+; GCN-NEXT: s_mul_i32 s21, s16, s17
+; GCN-NEXT: s_mul_hi_u32 s23, s16, s22
+; GCN-NEXT: s_mul_hi_u32 s20, s16, s17
; GCN-NEXT: s_add_u32 s21, s23, s21
; GCN-NEXT: s_addc_u32 s20, 0, s20
-; GCN-NEXT: s_mul_hi_u32 s24, s17, s22
-; GCN-NEXT: s_mul_i32 s22, s17, s22
+; GCN-NEXT: s_mul_hi_u32 s24, s19, s22
+; GCN-NEXT: s_mul_i32 s22, s19, s22
; GCN-NEXT: s_add_u32 s21, s21, s22
-; GCN-NEXT: s_mul_hi_u32 s23, s17, s19
+; GCN-NEXT: s_mul_hi_u32 s23, s19, s17
; GCN-NEXT: s_addc_u32 s20, s20, s24
; GCN-NEXT: s_addc_u32 s21, s23, 0
-; GCN-NEXT: s_mul_i32 s19, s17, s19
-; GCN-NEXT: s_add_u32 s19, s20, s19
+; GCN-NEXT: s_mul_i32 s17, s19, s17
+; GCN-NEXT: s_add_u32 s17, s20, s17
; GCN-NEXT: s_addc_u32 s20, 0, s21
-; GCN-NEXT: s_add_u32 s18, s18, s19
-; GCN-NEXT: s_addc_u32 s17, s17, s20
-; GCN-NEXT: s_mul_i32 s19, s9, s17
-; GCN-NEXT: s_mul_hi_u32 s20, s9, s18
-; GCN-NEXT: s_add_i32 s19, s20, s19
-; GCN-NEXT: s_mul_i32 s16, s16, s18
-; GCN-NEXT: s_add_i32 s19, s19, s16
-; GCN-NEXT: s_mul_i32 s9, s9, s18
-; GCN-NEXT: s_mul_hi_u32 s20, s17, s9
-; GCN-NEXT: s_mul_i32 s21, s17, s9
-; GCN-NEXT: s_mul_i32 s23, s18, s19
-; GCN-NEXT: s_mul_hi_u32 s9, s18, s9
-; GCN-NEXT: s_mul_hi_u32 s22, s18, s19
+; GCN-NEXT: s_add_u32 s21, s16, s17
+; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
+; GCN-NEXT: s_addc_u32 s19, s19, s20
+; GCN-NEXT: s_mul_i32 s16, s9, s19
+; GCN-NEXT: s_mul_hi_u32 s17, s9, s21
+; GCN-NEXT: s_add_i32 s16, s17, s16
+; GCN-NEXT: s_mul_i32 s18, s18, s21
+; GCN-NEXT: s_add_i32 s16, s16, s18
+; GCN-NEXT: s_mul_i32 s9, s9, s21
+; GCN-NEXT: s_mul_hi_u32 s18, s19, s9
+; GCN-NEXT: s_mul_i32 s20, s19, s9
+; GCN-NEXT: s_mul_i32 s23, s21, s16
+; GCN-NEXT: s_mul_hi_u32 s9, s21, s9
+; GCN-NEXT: s_mul_hi_u32 s22, s21, s16
; GCN-NEXT: s_add_u32 s9, s9, s23
; GCN-NEXT: s_addc_u32 s22, 0, s22
-; GCN-NEXT: s_add_u32 s9, s9, s21
-; GCN-NEXT: s_mul_hi_u32 s16, s17, s19
-; GCN-NEXT: s_addc_u32 s9, s22, s20
-; GCN-NEXT: s_addc_u32 s16, s16, 0
-; GCN-NEXT: s_mul_i32 s19, s17, s19
-; GCN-NEXT: s_add_u32 s9, s9, s19
-; GCN-NEXT: s_addc_u32 s16, 0, s16
-; GCN-NEXT: s_add_u32 s9, s18, s9
-; GCN-NEXT: s_addc_u32 s20, s17, s16
+; GCN-NEXT: s_add_u32 s9, s9, s20
+; GCN-NEXT: s_mul_hi_u32 s17, s19, s16
+; GCN-NEXT: s_addc_u32 s9, s22, s18
+; GCN-NEXT: s_addc_u32 s17, s17, 0
+; GCN-NEXT: s_mul_i32 s16, s19, s16
+; GCN-NEXT: s_add_u32 s9, s9, s16
+; GCN-NEXT: s_addc_u32 s18, 0, s17
+; GCN-NEXT: s_add_u32 s9, s21, s9
+; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
+; GCN-NEXT: s_addc_u32 s20, s19, s18
; GCN-NEXT: s_ashr_i32 s16, s11, 31
; GCN-NEXT: s_add_u32 s18, s10, s16
; GCN-NEXT: s_mov_b32 s17, s16
@@ -5292,9 +5344,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s9, s14, s9
; GCN-NEXT: s_sub_u32 s9, s18, s9
; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s18, s22, s15
; GCN-NEXT: s_sub_u32 s24, s9, s14
; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
; GCN-NEXT: s_subb_u32 s25, s18, 0
; GCN-NEXT: s_cmp_ge_u32 s25, s15
; GCN-NEXT: s_cselect_b32 s26, -1, 0
@@ -5304,10 +5358,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s26, s27, s26
; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
; GCN-NEXT: s_subb_u32 s18, s18, s15
-; GCN-NEXT: s_sub_u32 s22, s24, s14
+; GCN-NEXT: s_sub_u32 s27, s24, s14
+; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
; GCN-NEXT: s_subb_u32 s18, s18, 0
; GCN-NEXT: s_cmp_lg_u32 s26, 0
-; GCN-NEXT: s_cselect_b32 s22, s22, s24
+; GCN-NEXT: s_cselect_b32 s22, s27, s24
; GCN-NEXT: s_cselect_b32 s18, s18, s25
; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s11, s19, s11
@@ -5364,7 +5420,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11
; GCN-NEXT: s_sub_u32 s3, 0, s10
-; GCN-NEXT: s_subb_u32 s12, 0, s11
+; GCN-NEXT: s_subb_u32 s14, 0, s11
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -5373,52 +5429,56 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s13, v1
-; GCN-NEXT: v_readfirstlane_b32 s14, v0
-; GCN-NEXT: s_mul_i32 s15, s3, s13
-; GCN-NEXT: s_mul_hi_u32 s17, s3, s14
-; GCN-NEXT: s_mul_i32 s16, s12, s14
-; GCN-NEXT: s_add_i32 s15, s17, s15
-; GCN-NEXT: s_add_i32 s15, s15, s16
-; GCN-NEXT: s_mul_i32 s18, s3, s14
-; GCN-NEXT: s_mul_i32 s17, s14, s15
-; GCN-NEXT: s_mul_hi_u32 s19, s14, s18
-; GCN-NEXT: s_mul_hi_u32 s16, s14, s15
+; GCN-NEXT: v_readfirstlane_b32 s15, v1
+; GCN-NEXT: v_readfirstlane_b32 s12, v0
+; GCN-NEXT: s_mul_i32 s13, s3, s15
+; GCN-NEXT: s_mul_hi_u32 s17, s3, s12
+; GCN-NEXT: s_mul_i32 s16, s14, s12
+; GCN-NEXT: s_add_i32 s13, s17, s13
+; GCN-NEXT: s_add_i32 s13, s13, s16
+; GCN-NEXT: s_mul_i32 s18, s3, s12
+; GCN-NEXT: s_mul_i32 s17, s12, s13
+; GCN-NEXT: s_mul_hi_u32 s19, s12, s18
+; GCN-NEXT: s_mul_hi_u32 s16, s12, s13
; GCN-NEXT: s_add_u32 s17, s19, s17
; GCN-NEXT: s_addc_u32 s16, 0, s16
-; GCN-NEXT: s_mul_hi_u32 s20, s13, s18
-; GCN-NEXT: s_mul_i32 s18, s13, s18
+; GCN-NEXT: s_mul_hi_u32 s20, s15, s18
+; GCN-NEXT: s_mul_i32 s18, s15, s18
; GCN-NEXT: s_add_u32 s17, s17, s18
-; GCN-NEXT: s_mul_hi_u32 s19, s13, s15
+; GCN-NEXT: s_mul_hi_u32 s19, s15, s13
; GCN-NEXT: s_addc_u32 s16, s16, s20
; GCN-NEXT: s_addc_u32 s17, s19, 0
-; GCN-NEXT: s_mul_i32 s15, s13, s15
-; GCN-NEXT: s_add_u32 s15, s16, s15
+; GCN-NEXT: s_mul_i32 s13, s15, s13
+; GCN-NEXT: s_add_u32 s13, s16, s13
; GCN-NEXT: s_addc_u32 s16, 0, s17
-; GCN-NEXT: s_add_u32 s14, s14, s15
-; GCN-NEXT: s_addc_u32 s13, s13, s16
-; GCN-NEXT: s_mul_i32 s15, s3, s13
-; GCN-NEXT: s_mul_hi_u32 s16, s3, s14
-; GCN-NEXT: s_add_i32 s15, s16, s15
-; GCN-NEXT: s_mul_i32 s12, s12, s14
-; GCN-NEXT: s_add_i32 s15, s15, s12
-; GCN-NEXT: s_mul_i32 s3, s3, s14
-; GCN-NEXT: s_mul_hi_u32 s16, s13, s3
-; GCN-NEXT: s_mul_i32 s17, s13, s3
-; GCN-NEXT: s_mul_i32 s19, s14, s15
-; GCN-NEXT: s_mul_hi_u32 s3, s14, s3
-; GCN-NEXT: s_mul_hi_u32 s18, s14, s15
+; GCN-NEXT: s_add_u32 s17, s12, s13
+; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
+; GCN-NEXT: s_addc_u32 s15, s15, s16
+; GCN-NEXT: s_mul_i32 s12, s3, s15
+; GCN-NEXT: s_mul_hi_u32 s13, s3, s17
+; GCN-NEXT: s_add_i32 s12, s13, s12
+; GCN-NEXT: s_mul_i32 s14, s14, s17
+; GCN-NEXT: s_add_i32 s12, s12, s14
+; GCN-NEXT: s_mul_i32 s3, s3, s17
+; GCN-NEXT: s_mul_hi_u32 s14, s15, s3
+; GCN-NEXT: s_mul_i32 s16, s15, s3
+; GCN-NEXT: s_mul_i32 s19, s17, s12
+; GCN-NEXT: s_mul_hi_u32 s3, s17, s3
+; GCN-NEXT: s_mul_hi_u32 s18, s17, s12
; GCN-NEXT: s_add_u32 s3, s3, s19
; GCN-NEXT: s_addc_u32 s18, 0, s18
-; GCN-NEXT: s_add_u32 s3, s3, s17
-; GCN-NEXT: s_mul_hi_u32 s12, s13, s15
-; GCN-NEXT: s_addc_u32 s3, s18, s16
-; GCN-NEXT: s_addc_u32 s12, s12, 0
-; GCN-NEXT: s_mul_i32 s15, s13, s15
-; GCN-NEXT: s_add_u32 s3, s3, s15
-; GCN-NEXT: s_addc_u32 s12, 0, s12
-; GCN-NEXT: s_add_u32 s3, s14, s3
-; GCN-NEXT: s_addc_u32 s16, s13, s12
+; GCN-NEXT: s_add_u32 s3, s3, s16
+; GCN-NEXT: s_mul_hi_u32 s13, s15, s12
+; GCN-NEXT: s_addc_u32 s3, s18, s14
+; GCN-NEXT: s_addc_u32 s13, s13, 0
+; GCN-NEXT: s_mul_i32 s12, s15, s12
+; GCN-NEXT: s_add_u32 s3, s3, s12
+; GCN-NEXT: s_addc_u32 s14, 0, s13
+; GCN-NEXT: s_add_u32 s3, s17, s3
+; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
+; GCN-NEXT: s_addc_u32 s16, s15, s14
; GCN-NEXT: s_ashr_i32 s12, s5, 31
; GCN-NEXT: s_add_u32 s14, s4, s12
; GCN-NEXT: s_mov_b32 s13, s12
@@ -5447,9 +5507,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s3, s10, s3
; GCN-NEXT: s_sub_u32 s3, s14, s3
; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s14, s18, s11
; GCN-NEXT: s_sub_u32 s20, s3, s10
; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s21, s14, 0
; GCN-NEXT: s_cmp_ge_u32 s21, s11
; GCN-NEXT: s_cselect_b32 s22, -1, 0
@@ -5459,10 +5521,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s22, s23, s22
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s14, s14, s11
-; GCN-NEXT: s_sub_u32 s18, s20, s10
+; GCN-NEXT: s_sub_u32 s23, s20, s10
+; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s14, s14, 0
; GCN-NEXT: s_cmp_lg_u32 s22, 0
-; GCN-NEXT: s_cselect_b32 s18, s18, s20
+; GCN-NEXT: s_cselect_b32 s18, s23, s20
; GCN-NEXT: s_cselect_b32 s14, s14, s21
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s5, s15, s5
@@ -6235,9 +6299,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_readfirstlane_b32 s14, v8
; TONGA-NEXT: s_sub_u32 s12, s12, s14
; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0
+; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s1, s1, s7
; TONGA-NEXT: s_sub_u32 s18, s12, s6
; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
+; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s19, s1, 0
; TONGA-NEXT: s_cmp_ge_u32 s19, s7
; TONGA-NEXT: s_cselect_b32 s20, -1, 0
@@ -6247,10 +6313,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: s_cselect_b32 s20, s21, s20
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s1, s1, s7
-; TONGA-NEXT: s_sub_u32 s16, s18, s6
+; TONGA-NEXT: s_sub_u32 s21, s18, s6
+; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
+; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s1, s1, 0
; TONGA-NEXT: s_cmp_lg_u32 s20, 0
-; TONGA-NEXT: s_cselect_b32 s16, s16, s18
+; TONGA-NEXT: s_cselect_b32 s16, s21, s18
; TONGA-NEXT: s_cselect_b32 s1, s1, s19
; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s3, s13, s3