diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/srem64.ll')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/srem64.ll | 416 |
1 files changed, 189 insertions, 227 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index ea9bb04..02d2e6c 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -8,12 +8,11 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GCN-NEXT: s_sub_u32 s10, 0, s8 -; GCN-NEXT: s_subb_u32 s11, 0, s9 +; GCN-NEXT: s_sub_u32 s0, 0, s8 +; GCN-NEXT: s_subb_u32 s1, 0, s9 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -22,69 +21,65 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s10, v0 -; GCN-NEXT: v_readfirstlane_b32 s12, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_mul_i32 s1, s10, s12 -; GCN-NEXT: v_readfirstlane_b32 s15, v2 -; GCN-NEXT: s_mul_i32 s13, s11, s0 -; GCN-NEXT: s_mul_i32 s14, s10, s0 -; GCN-NEXT: s_add_i32 s1, s15, s1 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s14 -; GCN-NEXT: s_add_i32 s1, s1, s13 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s1 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s14 -; GCN-NEXT: v_readfirstlane_b32 s13, v3 -; GCN-NEXT: s_mul_i32 s15, s0, s1 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s1 -; GCN-NEXT: s_add_u32 s13, s13, s15 +; GCN-NEXT: v_mul_hi_u32 v2, s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s10, v1 +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: s_mul_i32 s11, s0, s10 +; GCN-NEXT: v_readfirstlane_b32 s14, v2 +; GCN-NEXT: s_mul_i32 s12, s1, s2 +; GCN-NEXT: s_mul_i32 s13, s0, s2 +; GCN-NEXT: s_add_i32 s11, s14, s11 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s13 +; GCN-NEXT: s_add_i32 s11, s11, s12 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s11 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s13 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_mul_i32 s15, s2, s11 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s11 +; GCN-NEXT: s_add_u32 s12, s12, s15 ; GCN-NEXT: v_readfirstlane_b32 s15, v0 -; GCN-NEXT: s_mul_i32 s14, s12, s14 +; GCN-NEXT: s_mul_i32 s13, s10, s13 ; GCN-NEXT: s_addc_u32 s15, 0, s15 -; GCN-NEXT: v_readfirstlane_b32 s16, v4 -; GCN-NEXT: s_add_u32 s13, s13, s14 -; GCN-NEXT: s_addc_u32 s13, s15, s16 -; GCN-NEXT: v_readfirstlane_b32 s14, v1 -; GCN-NEXT: s_addc_u32 s14, s14, 0 -; GCN-NEXT: s_mul_i32 s1, s12, s1 -; GCN-NEXT: s_add_u32 s1, s13, s1 -; GCN-NEXT: s_addc_u32 s13, 0, s14 -; GCN-NEXT: s_add_u32 s14, s0, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_addc_u32 s12, s12, s13 -; GCN-NEXT: s_mul_i32 s0, s10, s12 -; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s0, s1, s0 -; GCN-NEXT: s_mul_i32 s11, s11, s14 -; GCN-NEXT: s_mul_i32 s1, s10, s14 -; GCN-NEXT: s_add_i32 s0, s0, s11 -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mul_hi_u32 v3, s12, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s14, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s12, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s14, v0 -; GCN-NEXT: s_mul_i32 s11, s14, s0 -; GCN-NEXT: v_readfirstlane_b32 s15, v2 -; GCN-NEXT: s_add_u32 s11, s15, s11 +; GCN-NEXT: v_readfirstlane_b32 s14, v4 +; GCN-NEXT: s_add_u32 s12, s12, s13 +; GCN-NEXT: s_addc_u32 s12, s15, s14 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NEXT: s_mul_i32 s11, s10, s11 +; GCN-NEXT: s_add_u32 s11, s12, s11 +; GCN-NEXT: s_addc_u32 s12, 0, s13 +; GCN-NEXT: s_add_u32 s11, s2, s11 +; GCN-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 +; GCN-NEXT: s_addc_u32 s10, s10, s12 +; GCN-NEXT: s_mul_i32 s12, s0, s10 +; GCN-NEXT: s_mul_i32 s1, s1, s11 ; GCN-NEXT: v_readfirstlane_b32 s13, v0 -; GCN-NEXT: s_mul_i32 s1, s12, s1 -; GCN-NEXT: s_addc_u32 s13, 0, s13 -; GCN-NEXT: v_readfirstlane_b32 s10, v3 -; GCN-NEXT: s_add_u32 s1, s11, s1 -; GCN-NEXT: s_addc_u32 s1, s13, s10 -; GCN-NEXT: v_readfirstlane_b32 s10, v1 -; GCN-NEXT: s_addc_u32 s10, s10, 0 -; GCN-NEXT: s_mul_i32 s0, s12, s0 -; GCN-NEXT: s_add_u32 s0, s1, s0 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: s_add_u32 s11, s14, s0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_addc_u32 s1, s12, s10 +; GCN-NEXT: s_add_i32 s12, s13, s12 +; GCN-NEXT: s_mul_i32 s0, s0, s11 +; GCN-NEXT: s_add_i32 s1, s12, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mul_hi_u32 v3, s10, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s10, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: s_mul_i32 s13, s11, s1 +; GCN-NEXT: v_readfirstlane_b32 s15, v2 +; GCN-NEXT: s_add_u32 s13, s15, s13 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s0, s10, s0 +; GCN-NEXT: s_addc_u32 s14, 0, s14 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_add_u32 s0, s13, s0 +; GCN-NEXT: s_addc_u32 s0, s14, s12 +; GCN-NEXT: v_readfirstlane_b32 s12, v1 +; GCN-NEXT: s_addc_u32 s12, s12, 0 +; GCN-NEXT: s_mul_i32 s1, s10, s1 +; GCN-NEXT: s_add_u32 s0, s0, s1 +; GCN-NEXT: s_addc_u32 s1, 0, s12 +; GCN-NEXT: s_add_u32 s11, s11, s0 +; GCN-NEXT: s_addc_u32 s1, s10, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 ; GCN-NEXT: v_mov_b32_e32 v2, s11 @@ -118,11 +113,9 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_mul_i32 s4, s8, s4 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s11, s4, s5 ; GCN-NEXT: s_subb_u32 s13, s10, s9 ; GCN-NEXT: s_sub_u32 s14, s6, s8 ; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s15, s10, s11 ; GCN-NEXT: s_subb_u32 s15, s13, 0 ; GCN-NEXT: s_cmp_ge_u32 s15, s9 ; GCN-NEXT: s_cselect_b32 s16, -1, 0 @@ -131,13 +124,11 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_cmp_eq_u32 s15, s9 ; GCN-NEXT: s_cselect_b32 s16, s17, s16 ; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s13, s13, s9 -; GCN-NEXT: s_sub_u32 s17, s14, s8 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s10, s13, 0 +; GCN-NEXT: s_subb_u32 s10, s13, s9 +; GCN-NEXT: s_sub_u32 s11, s14, s8 +; GCN-NEXT: s_subb_u32 s10, s10, 0 ; GCN-NEXT: s_cmp_lg_u32 s16, 0 -; GCN-NEXT: s_cselect_b32 s11, s17, s14 +; GCN-NEXT: s_cselect_b32 s11, s11, s14 ; GCN-NEXT: s_cselect_b32 s10, s10, s15 ; GCN-NEXT: s_or_b32 s4, s4, s5 ; GCN-NEXT: s_subb_u32 s4, s7, s12 @@ -150,6 +141,7 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_cmp_lg_u32 s5, 0 ; GCN-NEXT: s_cselect_b32 s4, s10, s4 ; GCN-NEXT: s_cselect_b32 s5, s11, s6 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -180,8 +172,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 -; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-IR-NEXT: s_or_b32 s8, s8, s9 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -213,8 +203,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_sub_u32 s12, s12, s18 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s19 ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 -; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-IR-NEXT: s_or_b32 s18, s18, s19 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] @@ -968,81 +956,76 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5] ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GCN-NEXT: s_sub_u32 s10, 0, s4 -; GCN-NEXT: s_subb_u32 s11, 0, s5 +; GCN-NEXT: s_sub_u32 s8, 0, s4 +; GCN-NEXT: s_subb_u32 s9, 0, s5 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s10, v0 -; GCN-NEXT: v_readfirstlane_b32 s12, v1 -; GCN-NEXT: v_readfirstlane_b32 s8, v0 -; GCN-NEXT: s_mul_i32 s9, s10, s12 -; GCN-NEXT: v_readfirstlane_b32 s15, v2 -; GCN-NEXT: s_mul_i32 s13, s11, s8 -; GCN-NEXT: s_mul_i32 s14, s10, s8 -; GCN-NEXT: s_add_i32 s9, s15, s9 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s14 -; GCN-NEXT: s_add_i32 s9, s9, s13 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s9 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s14 -; GCN-NEXT: v_readfirstlane_b32 s13, v3 -; GCN-NEXT: s_mul_i32 s15, s8, s9 -; GCN-NEXT: s_add_u32 s13, s13, s15 -; GCN-NEXT: v_readfirstlane_b32 s15, v0 -; GCN-NEXT: v_mul_hi_u32 v0, v1, s9 -; GCN-NEXT: s_addc_u32 s15, 0, s15 -; GCN-NEXT: s_mul_i32 s14, s12, s14 -; GCN-NEXT: v_readfirstlane_b32 s16, v4 -; GCN-NEXT: s_add_u32 s13, s13, s14 -; GCN-NEXT: s_addc_u32 s13, s15, s16 +; GCN-NEXT: v_mul_hi_u32 v2, s8, v0 +; GCN-NEXT: v_readfirstlane_b32 s10, v1 +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: s_mul_i32 s11, s8, s10 +; GCN-NEXT: v_readfirstlane_b32 s14, v2 +; GCN-NEXT: s_mul_i32 s12, s9, s2 +; GCN-NEXT: s_mul_i32 s13, s8, s2 +; GCN-NEXT: s_add_i32 s11, s14, s11 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s13 +; GCN-NEXT: s_add_i32 s11, s11, s12 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s11 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s13 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_mul_i32 s14, s2, s11 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s11 +; GCN-NEXT: s_add_u32 s12, s12, s14 ; GCN-NEXT: v_readfirstlane_b32 s14, v0 -; GCN-NEXT: s_addc_u32 s14, s14, 0 -; GCN-NEXT: s_mul_i32 s9, s12, s9 -; GCN-NEXT: s_add_u32 s9, s13, s9 -; GCN-NEXT: s_addc_u32 s13, 0, s14 -; GCN-NEXT: s_add_u32 s14, s8, s9 -; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_addc_u32 s12, s12, s13 -; GCN-NEXT: s_mul_i32 s8, s10, s12 -; GCN-NEXT: v_readfirstlane_b32 s9, v0 -; GCN-NEXT: s_add_i32 s8, s9, s8 -; GCN-NEXT: s_mul_i32 s11, s11, s14 -; GCN-NEXT: s_mul_i32 s9, s10, s14 -; GCN-NEXT: s_add_i32 s8, s8, s11 -; GCN-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mul_hi_u32 v3, s12, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s14, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s12, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s14, v0 -; GCN-NEXT: s_mul_i32 s11, s14, s8 -; GCN-NEXT: v_readfirstlane_b32 s15, v2 -; GCN-NEXT: s_add_u32 s11, s15, s11 +; GCN-NEXT: s_addc_u32 s14, 0, s14 +; GCN-NEXT: s_mul_i32 s13, s10, s13 +; GCN-NEXT: v_readfirstlane_b32 s15, v4 +; GCN-NEXT: s_add_u32 s12, s12, s13 +; GCN-NEXT: s_addc_u32 s12, s14, s15 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NEXT: s_mul_i32 s11, s10, s11 +; GCN-NEXT: s_add_u32 s11, s12, s11 +; GCN-NEXT: s_addc_u32 s12, 0, s13 +; GCN-NEXT: s_add_u32 s11, s2, s11 +; GCN-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NEXT: v_mul_hi_u32 v0, s8, v0 +; GCN-NEXT: s_addc_u32 s10, s10, s12 +; GCN-NEXT: s_mul_i32 s12, s8, s10 +; GCN-NEXT: s_mul_i32 s9, s9, s11 ; GCN-NEXT: v_readfirstlane_b32 s13, v0 -; GCN-NEXT: s_mul_i32 s9, s12, s9 -; GCN-NEXT: s_addc_u32 s13, 0, s13 -; GCN-NEXT: v_readfirstlane_b32 s10, v3 -; GCN-NEXT: s_add_u32 s9, s11, s9 -; GCN-NEXT: s_addc_u32 s9, s13, s10 -; GCN-NEXT: v_readfirstlane_b32 s10, v1 -; GCN-NEXT: s_addc_u32 s10, s10, 0 -; GCN-NEXT: s_mul_i32 s8, s12, s8 -; GCN-NEXT: s_add_u32 s8, s9, s8 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: s_add_u32 s11, s14, s8 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_addc_u32 s10, s12, s10 +; GCN-NEXT: s_add_i32 s12, s13, s12 +; GCN-NEXT: s_mul_i32 s8, s8, s11 +; GCN-NEXT: s_add_i32 s9, s12, s9 +; GCN-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s9 +; GCN-NEXT: v_mul_hi_u32 v3, s10, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s10, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: s_mul_i32 s13, s11, s9 +; GCN-NEXT: v_readfirstlane_b32 s15, v2 +; GCN-NEXT: s_add_u32 s13, s15, s13 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s8, s10, s8 +; GCN-NEXT: s_addc_u32 s14, 0, s14 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_add_u32 s8, s13, s8 +; GCN-NEXT: s_addc_u32 s8, s14, s12 +; GCN-NEXT: v_readfirstlane_b32 s12, v1 +; GCN-NEXT: s_addc_u32 s12, s12, 0 +; GCN-NEXT: s_mul_i32 s9, s10, s9 +; GCN-NEXT: s_add_u32 s8, s8, s9 +; GCN-NEXT: s_addc_u32 s9, 0, s12 +; GCN-NEXT: s_add_u32 s11, s11, s8 +; GCN-NEXT: s_addc_u32 s10, s10, s9 ; GCN-NEXT: s_ashr_i32 s8, s7, 31 ; GCN-NEXT: s_add_u32 s6, s6, s8 ; GCN-NEXT: s_mov_b32 s9, s8 @@ -1071,6 +1054,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 ; GCN-NEXT: s_addc_u32 s11, 0, s12 ; GCN-NEXT: s_mul_i32 s11, s4, s11 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_readfirstlane_b32 s12, v0 ; GCN-NEXT: s_add_i32 s11, s12, s11 ; GCN-NEXT: s_mul_i32 s12, s5, s10 @@ -1079,11 +1063,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_mul_i32 s10, s4, s10 ; GCN-NEXT: s_sub_u32 s6, s6, s10 ; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s13, s10, s11 ; GCN-NEXT: s_subb_u32 s15, s12, s5 ; GCN-NEXT: s_sub_u32 s16, s6, s4 ; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_or_b32 s17, s12, s13 ; GCN-NEXT: s_subb_u32 s17, s15, 0 ; GCN-NEXT: s_cmp_ge_u32 s17, s5 ; GCN-NEXT: s_cselect_b32 s18, -1, 0 @@ -1092,13 +1074,11 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_cmp_eq_u32 s17, s5 ; GCN-NEXT: s_cselect_b32 s18, s19, s18 ; GCN-NEXT: s_or_b32 s12, s12, s13 -; GCN-NEXT: s_subb_u32 s15, s15, s5 -; GCN-NEXT: s_sub_u32 s19, s16, s4 -; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_or_b32 s12, s12, s13 -; GCN-NEXT: s_subb_u32 s12, s15, 0 +; GCN-NEXT: s_subb_u32 s12, s15, s5 +; GCN-NEXT: s_sub_u32 s13, s16, s4 +; GCN-NEXT: s_subb_u32 s12, s12, 0 ; GCN-NEXT: s_cmp_lg_u32 s18, 0 -; GCN-NEXT: s_cselect_b32 s13, s19, s16 +; GCN-NEXT: s_cselect_b32 s13, s13, s16 ; GCN-NEXT: s_cselect_b32 s12, s12, s17 ; GCN-NEXT: s_or_b32 s10, s10, s11 ; GCN-NEXT: s_subb_u32 s7, s7, s14 @@ -1156,8 +1136,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s16, s14, 1 -; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-IR-NEXT: s_or_b32 s10, s10, s11 ; GCN-IR-NEXT: s_addc_u32 s10, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s14, 63, s14 @@ -1189,8 +1167,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_sub_u32 s14, s14, s20 ; GCN-IR-NEXT: s_subb_u32 s15, s15, s21 ; GCN-IR-NEXT: s_add_u32 s18, s18, 1 -; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-IR-NEXT: s_or_b32 s20, s20, s21 ; GCN-IR-NEXT: s_addc_u32 s19, s19, 0 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3] @@ -1316,8 +1292,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5 ; GCN-NEXT: s_sub_u32 s2, 0, s4 -; GCN-NEXT: s_subb_u32 s8, 0, s5 -; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_subb_u32 s6, 0, s5 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1327,72 +1302,68 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_mul_hi_u32 v2, s2, v0 -; GCN-NEXT: v_readfirstlane_b32 s9, v1 -; GCN-NEXT: v_readfirstlane_b32 s6, v0 -; GCN-NEXT: s_mul_i32 s7, s2, s9 -; GCN-NEXT: v_readfirstlane_b32 s12, v2 -; GCN-NEXT: s_mul_i32 s10, s8, s6 -; GCN-NEXT: s_mul_i32 s11, s2, s6 -; GCN-NEXT: s_add_i32 s7, s12, s7 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s11 -; GCN-NEXT: s_add_i32 s7, s7, s10 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s7 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s11 -; GCN-NEXT: v_readfirstlane_b32 s10, v3 -; GCN-NEXT: s_mul_i32 s13, s6, s7 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s7 -; GCN-NEXT: s_add_u32 s10, s10, s13 -; GCN-NEXT: v_readfirstlane_b32 s13, v0 -; GCN-NEXT: s_mul_i32 s11, s9, s11 -; GCN-NEXT: s_addc_u32 s13, 0, s13 -; GCN-NEXT: v_readfirstlane_b32 s12, v4 -; GCN-NEXT: s_add_u32 s10, s10, s11 -; GCN-NEXT: s_addc_u32 s10, s13, s12 -; GCN-NEXT: v_readfirstlane_b32 s11, v1 -; GCN-NEXT: s_addc_u32 s11, s11, 0 -; GCN-NEXT: s_mul_i32 s7, s9, s7 -; GCN-NEXT: s_add_u32 s7, s10, s7 -; GCN-NEXT: s_addc_u32 s10, 0, s11 -; GCN-NEXT: s_add_u32 s11, s6, s7 -; GCN-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NEXT: v_readfirstlane_b32 s7, v1 +; GCN-NEXT: v_readfirstlane_b32 s3, v0 +; GCN-NEXT: s_mul_i32 s8, s2, s7 +; GCN-NEXT: v_readfirstlane_b32 s11, v2 +; GCN-NEXT: s_mul_i32 s9, s6, s3 +; GCN-NEXT: s_mul_i32 s10, s2, s3 +; GCN-NEXT: s_add_i32 s8, s11, s8 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s10 +; GCN-NEXT: s_add_i32 s8, s8, s9 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s8 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s10 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s8 +; GCN-NEXT: s_mul_i32 s12, s3, s8 +; GCN-NEXT: s_add_u32 s9, s9, s12 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_mul_i32 s10, s7, s10 +; GCN-NEXT: s_addc_u32 s12, 0, s12 +; GCN-NEXT: v_readfirstlane_b32 s11, v4 +; GCN-NEXT: s_add_u32 s9, s9, s10 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: s_addc_u32 s9, s12, s11 +; GCN-NEXT: s_addc_u32 s10, s13, 0 +; GCN-NEXT: s_mul_i32 s8, s7, s8 +; GCN-NEXT: s_add_u32 s8, s9, s8 +; GCN-NEXT: s_addc_u32 s9, 0, s10 +; GCN-NEXT: s_add_u32 s8, s3, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 -; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_addc_u32 s9, s9, s10 -; GCN-NEXT: s_mul_i32 s6, s2, s9 -; GCN-NEXT: v_readfirstlane_b32 s7, v0 -; GCN-NEXT: s_add_i32 s6, s7, s6 -; GCN-NEXT: s_mul_i32 s8, s8, s11 -; GCN-NEXT: s_mul_i32 s2, s2, s11 -; GCN-NEXT: s_add_i32 s6, s6, s8 +; GCN-NEXT: s_addc_u32 s7, s7, s9 +; GCN-NEXT: s_mul_i32 s9, s2, s7 +; GCN-NEXT: s_mul_i32 s6, s6, s8 +; GCN-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-NEXT: s_add_i32 s9, s10, s9 +; GCN-NEXT: s_mul_i32 s2, s2, s8 +; GCN-NEXT: s_add_i32 s6, s9, s6 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mul_hi_u32 v3, s9, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s9, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 -; GCN-NEXT: s_mul_i32 s8, s11, s6 +; GCN-NEXT: v_mul_hi_u32 v3, s7, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s8, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s7, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s8, v0 +; GCN-NEXT: s_mul_i32 s10, s8, s6 ; GCN-NEXT: v_readfirstlane_b32 s12, v2 -; GCN-NEXT: s_add_u32 s8, s12, s8 -; GCN-NEXT: v_readfirstlane_b32 s10, v0 -; GCN-NEXT: s_mul_i32 s2, s9, s2 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: v_readfirstlane_b32 s7, v3 -; GCN-NEXT: s_add_u32 s2, s8, s2 -; GCN-NEXT: s_addc_u32 s2, s10, s7 -; GCN-NEXT: v_readfirstlane_b32 s7, v1 -; GCN-NEXT: s_addc_u32 s7, s7, 0 -; GCN-NEXT: s_mul_i32 s6, s9, s6 +; GCN-NEXT: s_add_u32 s10, s12, s10 +; GCN-NEXT: v_readfirstlane_b32 s11, v0 +; GCN-NEXT: s_mul_i32 s2, s7, s2 +; GCN-NEXT: s_addc_u32 s11, 0, s11 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: s_add_u32 s2, s10, s2 +; GCN-NEXT: s_addc_u32 s2, s11, s9 +; GCN-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_mul_i32 s6, s7, s6 ; GCN-NEXT: s_add_u32 s2, s2, s6 -; GCN-NEXT: s_addc_u32 s8, 0, s7 -; GCN-NEXT: s_add_u32 s2, s11, s2 -; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_addc_u32 s6, s9, s8 +; GCN-NEXT: s_addc_u32 s6, 0, s9 +; GCN-NEXT: s_add_u32 s2, s8, s2 +; GCN-NEXT: s_addc_u32 s6, s7, s6 ; GCN-NEXT: v_mul_hi_u32 v1, s2, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s6, 24 ; GCN-NEXT: s_mul_i32 s6, s6, 24 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_readfirstlane_b32 s8, v1 ; GCN-NEXT: v_readfirstlane_b32 s7, v0 ; GCN-NEXT: s_add_u32 s6, s8, s6 @@ -1401,16 +1372,15 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 ; GCN-NEXT: s_mul_i32 s7, s5, s6 ; GCN-NEXT: s_mul_i32 s6, s4, s6 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_readfirstlane_b32 s8, v0 ; GCN-NEXT: s_add_i32 s10, s8, s7 ; GCN-NEXT: s_sub_i32 s8, 0, s10 ; GCN-NEXT: s_sub_u32 s11, 24, s6 ; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-NEXT: s_or_b32 s9, s6, s7 ; GCN-NEXT: s_subb_u32 s12, s8, s5 ; GCN-NEXT: s_sub_u32 s13, s11, s4 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s14, s8, s9 ; GCN-NEXT: s_subb_u32 s14, s12, 0 ; GCN-NEXT: s_cmp_ge_u32 s14, s5 ; GCN-NEXT: s_cselect_b32 s15, -1, 0 @@ -1419,13 +1389,11 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_cmp_eq_u32 s14, s5 ; GCN-NEXT: s_cselect_b32 s15, s16, s15 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s12, s12, s5 -; GCN-NEXT: s_sub_u32 s16, s13, s4 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s8, s12, 0 +; GCN-NEXT: s_subb_u32 s8, s12, s5 +; GCN-NEXT: s_sub_u32 s9, s13, s4 +; GCN-NEXT: s_subb_u32 s8, s8, 0 ; GCN-NEXT: s_cmp_lg_u32 s15, 0 -; GCN-NEXT: s_cselect_b32 s9, s16, s13 +; GCN-NEXT: s_cselect_b32 s9, s9, s13 ; GCN-NEXT: s_cselect_b32 s8, s8, s14 ; GCN-NEXT: s_or_b32 s6, s6, s7 ; GCN-NEXT: s_subb_u32 s6, 0, s10 @@ -1468,8 +1436,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_cbranch_vccz .LBB10_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s8, s2, 1 -; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-IR-NEXT: s_or_b32 s9, s10, s11 ; GCN-IR-NEXT: s_addc_u32 s3, s3, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s2, 63, s2 @@ -1500,8 +1466,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_sub_u32 s10, s10, s16 ; GCN-IR-NEXT: s_subb_u32 s11, s11, s17 ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 -; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-IR-NEXT: s_or_b32 s16, s16, s17 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[6:7] @@ -1644,8 +1608,7 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3 -; GCN-IR-NEXT: s_movk_i32 s6, 0xffc5 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v8 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc5, v8 ; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] @@ -1834,8 +1797,7 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3 -; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v8 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffd0, v8 ; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] |
