diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
61 files changed, 2622 insertions, 3200 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir index d9ac9a7..de1bb47 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -enable-unsafe-fp-math -run-pass=amdgpu-prelegalizer-combiner %s -o - | FileCheck -check-prefix=GFX10 %s +# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-prelegalizer-combiner %s -o - | FileCheck -check-prefix=GFX10 %s # Test that we fold correct element from G_UNMERGE_VALUES into fma diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir index 52b1beb..91f2f6f1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX10 -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX11 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX10 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX11 --- name: fract_f64_neg diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll index 5171403..7714c03 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll @@ -140,7 +140,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) { ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB9_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 @@ -345,7 +344,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) { ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 ; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1 ; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB17_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll index 7b01f13..7b81669 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll @@ -143,7 +143,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) { ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB9_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 @@ -348,7 +347,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) { ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0 ; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1 ; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB17_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 diff --git a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll index b72eba8..8088c1b 100644 --- a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll +++ b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll @@ -180,11 +180,7 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B ; CHECK-LABEL: s_add64_32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 s0, s0, s2 -; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0 ; CHECK-NEXT: s_addc_u32 s1, s1, s3 -; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 ; CHECK-NEXT: s_addc_u32 s2, s4, 0 ; CHECK-NEXT: ; return to shader part epilog %sum64 = add i64 %val64A, %val64B @@ -199,14 +195,10 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) { ; CHECK-LABEL: s_uadd_v2i64: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s2, s6 -; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0 -; CHECK-NEXT: s_addc_u32 s8, s3, s7 +; CHECK-NEXT: s_add_u32 s6, s2, s6 +; CHECK-NEXT: s_addc_u32 s7, s3, s7 ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 ; CHECK-NEXT: s_add_u32 s0, s0, s4 -; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0 ; CHECK-NEXT: s_addc_u32 s1, s1, s5 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 @@ -215,8 +207,8 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v7 ; CHECK-NEXT: v_readfirstlane_b32 s2, v6 -; CHECK-NEXT: v_mov_b32_e32 v4, s10 -; CHECK-NEXT: v_mov_b32_e32 v5, s8 +; CHECK-NEXT: v_mov_b32_e32 v4, s6 +; CHECK-NEXT: v_mov_b32_e32 v5, s7 ; CHECK-NEXT: s_mov_b32 s1, s0 ; CHECK-NEXT: s_mov_b32 s3, s2 ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] @@ -233,14 +225,10 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) { ; CHECK-LABEL: s_usub_v2i64: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_sub_u32 s10, s2, s6 -; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0 -; CHECK-NEXT: s_subb_u32 s8, s3, s7 +; CHECK-NEXT: s_sub_u32 s6, s2, s6 +; CHECK-NEXT: s_subb_u32 s7, s3, s7 ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 ; CHECK-NEXT: s_sub_u32 s0, s0, s4 -; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0 ; CHECK-NEXT: s_subb_u32 s1, s1, s5 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 @@ -249,8 +237,8 @@ define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v7 ; CHECK-NEXT: v_readfirstlane_b32 s2, v6 -; CHECK-NEXT: v_mov_b32_e32 v4, s10 -; CHECK-NEXT: v_mov_b32_e32 v5, s8 +; CHECK-NEXT: v_mov_b32_e32 v4, s6 +; CHECK-NEXT: v_mov_b32_e32 v5, s7 ; CHECK-NEXT: s_mov_b32 s1, s0 ; CHECK-NEXT: s_mov_b32 s3, s2 ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] @@ -268,8 +256,6 @@ define amdgpu_ps i64 @s_uadd_i64(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) ; CHECK-LABEL: s_uadd_i64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 s0, s0, s2 -; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0 ; CHECK-NEXT: s_addc_u32 s1, s1, s3 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 @@ -292,8 +278,6 @@ define amdgpu_ps i64 @s_uadd_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) { ; CHECK-LABEL: s_uadd_p1: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 s0, s0, 1 -; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 @@ -339,8 +323,6 @@ define amdgpu_ps i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) { ; CHECK-LABEL: s_usub_p1: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_sub_u32 s0, s0, 1 -; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 ; CHECK-NEXT: s_subb_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 @@ -363,8 +345,6 @@ define amdgpu_ps i64 @s_usub_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) { ; CHECK-LABEL: s_usub_n1: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_sub_u32 s0, s0, -1 -; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 ; CHECK-NEXT: s_subb_u32 s1, s1, -1 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index 948811e..51df8c3 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -7821,10 +7821,9 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_addc_u32 s15, 0, s16 ; GFX6-NEXT: s_add_u32 s16, s0, s1 ; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_addc_u32 s14, s14, s15 ; GFX6-NEXT: s_mul_i32 s0, s12, s14 ; GFX6-NEXT: v_readfirstlane_b32 s1, v0 @@ -7855,7 +7854,6 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_add_u32 s15, s16, s0 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_addc_u32 s14, s14, s12 ; GFX6-NEXT: s_ashr_i32 s12, s7, 31 ; GFX6-NEXT: s_add_u32 s0, s6, s12 @@ -7881,52 +7879,50 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: v_readfirstlane_b32 s4, v0 ; GFX6-NEXT: s_addc_u32 s4, s4, 0 ; GFX6-NEXT: s_mul_i32 s14, s7, s14 -; GFX6-NEXT: s_add_u32 s14, s1, s14 -; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: s_add_u32 s16, s1, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 ; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 -; GFX6-NEXT: s_addc_u32 s15, 0, s4 +; GFX6-NEXT: s_addc_u32 s17, 0, s4 ; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: s_mul_i32 s4, s10, s15 +; GFX6-NEXT: s_mul_i32 s4, s10, s17 ; GFX6-NEXT: v_readfirstlane_b32 s5, v0 ; GFX6-NEXT: s_add_i32 s4, s5, s4 -; GFX6-NEXT: s_mul_i32 s5, s11, s14 -; GFX6-NEXT: s_add_i32 s16, s4, s5 -; GFX6-NEXT: s_sub_i32 s17, s7, s16 -; GFX6-NEXT: s_mul_i32 s4, s10, s14 +; GFX6-NEXT: s_mul_i32 s5, s11, s16 +; GFX6-NEXT: s_add_i32 s18, s4, s5 +; GFX6-NEXT: s_sub_i32 s14, s7, s18 +; GFX6-NEXT: s_mul_i32 s4, s10, s16 ; GFX6-NEXT: s_sub_u32 s6, s6, s4 ; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX6-NEXT: s_or_b32 s18, s4, s5 -; GFX6-NEXT: s_cmp_lg_u32 s18, 0 -; GFX6-NEXT: s_subb_u32 s17, s17, s11 -; GFX6-NEXT: s_sub_u32 s19, s6, s10 -; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX6-NEXT: s_or_b32 s15, s4, s5 +; GFX6-NEXT: s_subb_u32 s19, s14, s11 +; GFX6-NEXT: s_sub_u32 s20, s6, s10 +; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX6-NEXT: s_or_b32 s14, s14, s15 +; GFX6-NEXT: s_subb_u32 s14, s19, 0 +; GFX6-NEXT: s_cmp_ge_u32 s14, s11 +; GFX6-NEXT: s_cselect_b32 s15, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s20, s10 +; GFX6-NEXT: s_cselect_b32 s19, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s14, s11 +; GFX6-NEXT: s_cselect_b32 s14, s19, s15 +; GFX6-NEXT: s_add_u32 s15, s16, 1 +; GFX6-NEXT: s_addc_u32 s19, s17, 0 +; GFX6-NEXT: s_add_u32 s20, s16, 2 +; GFX6-NEXT: s_addc_u32 s21, s17, 0 +; GFX6-NEXT: s_cmp_lg_u32 s14, 0 +; GFX6-NEXT: s_cselect_b32 s14, s20, s15 +; GFX6-NEXT: s_cselect_b32 s15, s21, s19 ; GFX6-NEXT: s_or_b32 s4, s4, s5 -; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_subb_u32 s4, s17, 0 +; GFX6-NEXT: s_subb_u32 s4, s7, s18 ; GFX6-NEXT: s_cmp_ge_u32 s4, s11 ; GFX6-NEXT: s_cselect_b32 s5, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s19, s10 -; GFX6-NEXT: s_cselect_b32 s17, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s4, s11 -; GFX6-NEXT: s_cselect_b32 s4, s17, s5 -; GFX6-NEXT: s_add_u32 s5, s14, 1 -; GFX6-NEXT: s_addc_u32 s17, s15, 0 -; GFX6-NEXT: s_add_u32 s19, s14, 2 -; GFX6-NEXT: s_addc_u32 s20, s15, 0 -; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_cselect_b32 s4, s19, s5 -; GFX6-NEXT: s_cselect_b32 s5, s20, s17 -; GFX6-NEXT: s_cmp_lg_u32 s18, 0 -; GFX6-NEXT: s_subb_u32 s7, s7, s16 -; GFX6-NEXT: s_cmp_ge_u32 s7, s11 -; GFX6-NEXT: s_cselect_b32 s16, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s6, s10 ; GFX6-NEXT: s_cselect_b32 s6, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s7, s11 -; GFX6-NEXT: s_cselect_b32 s6, s6, s16 -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 -; GFX6-NEXT: s_cselect_b32 s5, s5, s15 -; GFX6-NEXT: s_cselect_b32 s4, s4, s14 +; GFX6-NEXT: s_cmp_eq_u32 s4, s11 +; GFX6-NEXT: s_cselect_b32 s4, s6, s5 +; GFX6-NEXT: s_cmp_lg_u32 s4, 0 +; GFX6-NEXT: s_cselect_b32 s5, s15, s17 +; GFX6-NEXT: s_cselect_b32 s4, s14, s16 ; GFX6-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9] ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: s_sub_u32 s4, s4, s6 @@ -7949,8 +7945,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_sub_u32 s10, 0, s8 -; GFX9-NEXT: s_subb_u32 s11, 0, s9 +; GFX9-NEXT: s_sub_u32 s4, 0, s8 +; GFX9-NEXT: s_subb_u32 s5, 0, s9 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -7960,56 +7956,52 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s12, v2 -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_mul_i32 s5, s10, s12 -; GFX9-NEXT: s_mul_hi_u32 s14, s10, s4 -; GFX9-NEXT: s_mul_i32 s13, s11, s4 -; GFX9-NEXT: s_add_i32 s5, s14, s5 -; GFX9-NEXT: s_mul_i32 s15, s10, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s13 -; GFX9-NEXT: s_mul_hi_u32 s14, s4, s15 -; GFX9-NEXT: s_mul_i32 s16, s4, s5 -; GFX9-NEXT: s_mul_hi_u32 s13, s4, s5 +; GFX9-NEXT: v_readfirstlane_b32 s10, v2 +; GFX9-NEXT: v_readfirstlane_b32 s11, v1 +; GFX9-NEXT: s_mul_i32 s12, s4, s10 +; GFX9-NEXT: s_mul_hi_u32 s14, s4, s11 +; GFX9-NEXT: s_mul_i32 s13, s5, s11 +; GFX9-NEXT: s_add_i32 s12, s14, s12 +; GFX9-NEXT: s_mul_i32 s15, s4, s11 +; GFX9-NEXT: s_add_i32 s12, s12, s13 +; GFX9-NEXT: s_mul_hi_u32 s14, s11, s15 +; GFX9-NEXT: s_mul_i32 s16, s11, s12 +; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12 ; GFX9-NEXT: s_add_u32 s14, s14, s16 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15 -; GFX9-NEXT: s_mul_i32 s15, s12, s15 +; GFX9-NEXT: s_mul_hi_u32 s17, s10, s15 +; GFX9-NEXT: s_mul_i32 s15, s10, s15 ; GFX9-NEXT: s_add_u32 s14, s14, s15 -; GFX9-NEXT: s_mul_hi_u32 s16, s12, s5 +; GFX9-NEXT: s_mul_hi_u32 s16, s10, s12 ; GFX9-NEXT: s_addc_u32 s13, s13, s17 ; GFX9-NEXT: s_addc_u32 s14, s16, 0 -; GFX9-NEXT: s_mul_i32 s5, s12, s5 -; GFX9-NEXT: s_add_u32 s5, s13, s5 +; GFX9-NEXT: s_mul_i32 s12, s10, s12 +; GFX9-NEXT: s_add_u32 s12, s13, s12 ; GFX9-NEXT: s_addc_u32 s13, 0, s14 -; GFX9-NEXT: s_add_u32 s14, s4, s5 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s12, s12, s13 -; GFX9-NEXT: s_mul_i32 s4, s10, s12 -; GFX9-NEXT: s_mul_hi_u32 s5, s10, s14 -; GFX9-NEXT: s_add_i32 s4, s5, s4 -; GFX9-NEXT: s_mul_i32 s11, s11, s14 -; GFX9-NEXT: s_add_i32 s4, s4, s11 -; GFX9-NEXT: s_mul_i32 s10, s10, s14 -; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10 -; GFX9-NEXT: s_mul_i32 s13, s12, s10 -; GFX9-NEXT: s_mul_i32 s16, s14, s4 -; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10 -; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4 -; GFX9-NEXT: s_add_u32 s10, s10, s16 +; GFX9-NEXT: s_add_u32 s11, s11, s12 +; GFX9-NEXT: s_addc_u32 s10, s10, s13 +; GFX9-NEXT: s_mul_i32 s12, s4, s10 +; GFX9-NEXT: s_mul_hi_u32 s13, s4, s11 +; GFX9-NEXT: s_add_i32 s12, s13, s12 +; GFX9-NEXT: s_mul_i32 s5, s5, s11 +; GFX9-NEXT: s_add_i32 s12, s12, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s11 +; GFX9-NEXT: s_mul_hi_u32 s13, s10, s4 +; GFX9-NEXT: s_mul_i32 s14, s10, s4 +; GFX9-NEXT: s_mul_i32 s16, s11, s12 +; GFX9-NEXT: s_mul_hi_u32 s4, s11, s4 +; GFX9-NEXT: s_mul_hi_u32 s15, s11, s12 +; GFX9-NEXT: s_add_u32 s4, s4, s16 ; GFX9-NEXT: s_addc_u32 s15, 0, s15 -; GFX9-NEXT: s_add_u32 s10, s10, s13 -; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4 -; GFX9-NEXT: s_addc_u32 s10, s15, s11 +; GFX9-NEXT: s_add_u32 s4, s4, s14 +; GFX9-NEXT: s_mul_hi_u32 s5, s10, s12 +; GFX9-NEXT: s_addc_u32 s4, s15, s13 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_mul_i32 s4, s12, s4 -; GFX9-NEXT: s_add_u32 s4, s10, s4 -; GFX9-NEXT: s_addc_u32 s10, 0, s5 -; GFX9-NEXT: s_add_u32 s11, s14, s4 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s10, s12, s10 +; GFX9-NEXT: s_mul_i32 s12, s10, s12 +; GFX9-NEXT: s_add_u32 s4, s4, s12 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_add_u32 s11, s11, s4 +; GFX9-NEXT: s_addc_u32 s10, s10, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: s_add_u32 s2, s2, s4 @@ -8028,38 +8020,35 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_addc_u32 s11, s12, s15 ; GFX9-NEXT: s_addc_u32 s12, s14, 0 ; GFX9-NEXT: s_mul_i32 s10, s3, s10 -; GFX9-NEXT: s_add_u32 s14, s11, s10 -; GFX9-NEXT: s_addc_u32 s15, 0, s12 -; GFX9-NEXT: s_mul_i32 s10, s8, s15 -; GFX9-NEXT: s_mul_hi_u32 s11, s8, s14 +; GFX9-NEXT: s_add_u32 s13, s11, s10 +; GFX9-NEXT: s_addc_u32 s12, 0, s12 +; GFX9-NEXT: s_mul_i32 s10, s8, s12 +; GFX9-NEXT: s_mul_hi_u32 s11, s8, s13 ; GFX9-NEXT: s_add_i32 s10, s11, s10 -; GFX9-NEXT: s_mul_i32 s11, s9, s14 -; GFX9-NEXT: s_add_i32 s16, s10, s11 -; GFX9-NEXT: s_sub_i32 s12, s3, s16 -; GFX9-NEXT: s_mul_i32 s10, s8, s14 +; GFX9-NEXT: s_mul_i32 s11, s9, s13 +; GFX9-NEXT: s_add_i32 s14, s10, s11 +; GFX9-NEXT: s_sub_i32 s15, s3, s14 +; GFX9-NEXT: s_mul_i32 s10, s8, s13 ; GFX9-NEXT: s_sub_u32 s2, s2, s10 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s17, s12, s9 -; GFX9-NEXT: s_sub_u32 s18, s2, s8 -; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GFX9-NEXT: s_subb_u32 s12, s17, 0 -; GFX9-NEXT: s_cmp_ge_u32 s12, s9 -; GFX9-NEXT: s_cselect_b32 s13, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s18, s8 +; GFX9-NEXT: s_subb_u32 s15, s15, s9 +; GFX9-NEXT: s_sub_u32 s16, s2, s8 +; GFX9-NEXT: s_subb_u32 s15, s15, 0 +; GFX9-NEXT: s_cmp_ge_u32 s15, s9 ; GFX9-NEXT: s_cselect_b32 s17, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s12, s9 -; GFX9-NEXT: s_cselect_b32 s12, s17, s13 -; GFX9-NEXT: s_add_u32 s13, s14, 1 -; GFX9-NEXT: s_addc_u32 s17, s15, 0 -; GFX9-NEXT: s_add_u32 s18, s14, 2 -; GFX9-NEXT: s_addc_u32 s19, s15, 0 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 -; GFX9-NEXT: s_cselect_b32 s12, s18, s13 -; GFX9-NEXT: s_cselect_b32 s13, s19, s17 +; GFX9-NEXT: s_cmp_ge_u32 s16, s8 +; GFX9-NEXT: s_cselect_b32 s16, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s15, s9 +; GFX9-NEXT: s_cselect_b32 s15, s16, s17 +; GFX9-NEXT: s_add_u32 s16, s13, 1 +; GFX9-NEXT: s_addc_u32 s17, s12, 0 +; GFX9-NEXT: s_add_u32 s18, s13, 2 +; GFX9-NEXT: s_addc_u32 s19, s12, 0 +; GFX9-NEXT: s_cmp_lg_u32 s15, 0 +; GFX9-NEXT: s_cselect_b32 s15, s18, s16 +; GFX9-NEXT: s_cselect_b32 s16, s19, s17 ; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s3, s3, s16 +; GFX9-NEXT: s_subb_u32 s3, s3, s14 ; GFX9-NEXT: s_cmp_ge_u32 s3, s9 ; GFX9-NEXT: s_cselect_b32 s10, -1, 0 ; GFX9-NEXT: s_cmp_ge_u32 s2, s8 @@ -8067,8 +8056,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_cmp_eq_u32 s3, s9 ; GFX9-NEXT: s_cselect_b32 s2, s2, s10 ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 -; GFX9-NEXT: s_cselect_b32 s3, s13, s15 -; GFX9-NEXT: s_cselect_b32 s2, s12, s14 +; GFX9-NEXT: s_cselect_b32 s3, s16, s12 +; GFX9-NEXT: s_cselect_b32 s2, s15, s13 ; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: s_sub_u32 s2, s2, s4 @@ -8328,10 +8317,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_addc_u32 s17, 0, s18 ; GFX6-NEXT: s_add_u32 s18, s12, s13 ; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_addc_u32 s16, s16, s17 ; GFX6-NEXT: s_mul_i32 s12, s14, s16 ; GFX6-NEXT: v_readfirstlane_b32 s13, v0 @@ -8362,7 +8350,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_add_u32 s15, s18, s12 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_addc_u32 s14, s16, s14 ; GFX6-NEXT: s_ashr_i32 s12, s9, 31 ; GFX6-NEXT: s_add_u32 s8, s8, s12 @@ -8387,55 +8374,53 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 ; GFX6-NEXT: s_addc_u32 s16, s16, 0 ; GFX6-NEXT: s_mul_i32 s14, s9, s14 -; GFX6-NEXT: s_add_u32 s17, s15, s14 -; GFX6-NEXT: v_mov_b32_e32 v0, s17 +; GFX6-NEXT: s_add_u32 s18, s15, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s18 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX6-NEXT: s_addc_u32 s16, 0, s16 -; GFX6-NEXT: s_mul_i32 s14, s6, s16 +; GFX6-NEXT: s_addc_u32 s19, 0, s16 +; GFX6-NEXT: s_mul_i32 s14, s6, s19 ; GFX6-NEXT: v_readfirstlane_b32 s15, v0 ; GFX6-NEXT: s_add_i32 s14, s15, s14 -; GFX6-NEXT: s_mul_i32 s15, s7, s17 -; GFX6-NEXT: s_add_i32 s18, s14, s15 -; GFX6-NEXT: s_sub_i32 s19, s9, s18 -; GFX6-NEXT: s_mul_i32 s14, s6, s17 +; GFX6-NEXT: s_mul_i32 s15, s7, s18 +; GFX6-NEXT: s_add_i32 s20, s14, s15 +; GFX6-NEXT: s_sub_i32 s16, s9, s20 +; GFX6-NEXT: s_mul_i32 s14, s6, s18 ; GFX6-NEXT: s_sub_u32 s8, s8, s14 ; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s20, s14, s15 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_subb_u32 s19, s19, s7 -; GFX6-NEXT: s_sub_u32 s21, s8, s6 -; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX6-NEXT: s_or_b32 s17, s14, s15 +; GFX6-NEXT: s_subb_u32 s21, s16, s7 +; GFX6-NEXT: s_sub_u32 s22, s8, s6 +; GFX6-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GFX6-NEXT: s_or_b32 s16, s16, s17 +; GFX6-NEXT: s_subb_u32 s16, s21, 0 +; GFX6-NEXT: s_cmp_ge_u32 s16, s7 +; GFX6-NEXT: s_cselect_b32 s17, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s22, s6 +; GFX6-NEXT: s_cselect_b32 s21, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s16, s7 +; GFX6-NEXT: s_cselect_b32 s16, s21, s17 +; GFX6-NEXT: s_add_u32 s17, s18, 1 +; GFX6-NEXT: s_addc_u32 s21, s19, 0 +; GFX6-NEXT: s_add_u32 s22, s18, 2 +; GFX6-NEXT: s_addc_u32 s23, s19, 0 +; GFX6-NEXT: s_cmp_lg_u32 s16, 0 +; GFX6-NEXT: s_cselect_b32 s16, s22, s17 +; GFX6-NEXT: s_cselect_b32 s17, s23, s21 ; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_cmp_lg_u32 s14, 0 -; GFX6-NEXT: s_subb_u32 s14, s19, 0 -; GFX6-NEXT: s_cmp_ge_u32 s14, s7 -; GFX6-NEXT: s_cselect_b32 s15, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s21, s6 -; GFX6-NEXT: s_cselect_b32 s19, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s14, s7 -; GFX6-NEXT: s_cselect_b32 s14, s19, s15 -; GFX6-NEXT: s_add_u32 s15, s17, 1 -; GFX6-NEXT: s_addc_u32 s19, s16, 0 -; GFX6-NEXT: s_add_u32 s21, s17, 2 -; GFX6-NEXT: s_addc_u32 s22, s16, 0 -; GFX6-NEXT: s_cmp_lg_u32 s14, 0 -; GFX6-NEXT: s_cselect_b32 s14, s21, s15 -; GFX6-NEXT: s_cselect_b32 s15, s22, s19 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_subb_u32 s9, s9, s18 +; GFX6-NEXT: s_subb_u32 s9, s9, s20 ; GFX6-NEXT: s_cmp_ge_u32 s9, s7 -; GFX6-NEXT: s_cselect_b32 s18, -1, 0 +; GFX6-NEXT: s_cselect_b32 s14, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s8, s6 ; GFX6-NEXT: s_cselect_b32 s6, -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s9, s7 -; GFX6-NEXT: s_cselect_b32 s6, s6, s18 +; GFX6-NEXT: s_cselect_b32 s6, s6, s14 ; GFX6-NEXT: s_cmp_lg_u32 s6, 0 -; GFX6-NEXT: s_cselect_b32 s7, s15, s16 -; GFX6-NEXT: s_cselect_b32 s6, s14, s17 +; GFX6-NEXT: s_cselect_b32 s7, s17, s19 +; GFX6-NEXT: s_cselect_b32 s6, s16, s18 ; GFX6-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3] ; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] -; GFX6-NEXT: s_sub_u32 s14, s6, s2 -; GFX6-NEXT: s_subb_u32 s15, s7, s3 +; GFX6-NEXT: s_sub_u32 s16, s6, s2 +; GFX6-NEXT: s_subb_u32 s17, s7, s3 ; GFX6-NEXT: s_ashr_i32 s6, s1, 31 ; GFX6-NEXT: s_add_u32 s0, s0, s6 ; GFX6-NEXT: s_mov_b32 s7, s6 @@ -8454,40 +8439,39 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 -; GFX6-NEXT: v_readfirstlane_b32 s16, v1 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s1, s12, s16 +; GFX6-NEXT: s_mul_i32 s1, s12, s14 ; GFX6-NEXT: v_readfirstlane_b32 s3, v2 ; GFX6-NEXT: s_mul_i32 s0, s13, s2 ; GFX6-NEXT: s_add_i32 s1, s3, s1 ; GFX6-NEXT: s_add_i32 s3, s1, s0 -; GFX6-NEXT: s_mul_i32 s17, s12, s2 +; GFX6-NEXT: s_mul_i32 s15, s12, s2 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s17 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mul_i32 s4, s2, s3 ; GFX6-NEXT: v_readfirstlane_b32 s5, v2 ; GFX6-NEXT: v_readfirstlane_b32 s18, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, v1, s17 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15 ; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3 ; GFX6-NEXT: s_add_u32 s4, s18, s4 ; GFX6-NEXT: s_addc_u32 s5, 0, s5 -; GFX6-NEXT: s_mul_i32 s17, s16, s17 +; GFX6-NEXT: s_mul_i32 s15, s14, s15 ; GFX6-NEXT: v_readfirstlane_b32 s18, v0 -; GFX6-NEXT: s_add_u32 s4, s4, s17 +; GFX6-NEXT: s_add_u32 s4, s4, s15 ; GFX6-NEXT: s_addc_u32 s4, s5, s18 ; GFX6-NEXT: v_readfirstlane_b32 s5, v1 ; GFX6-NEXT: s_addc_u32 s5, s5, 0 -; GFX6-NEXT: s_mul_i32 s3, s16, s3 +; GFX6-NEXT: s_mul_i32 s3, s14, s3 ; GFX6-NEXT: s_add_u32 s3, s4, s3 ; GFX6-NEXT: s_addc_u32 s4, 0, s5 ; GFX6-NEXT: s_add_u32 s5, s2, s3 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_addc_u32 s4, s16, s4 +; GFX6-NEXT: s_addc_u32 s4, s14, s4 ; GFX6-NEXT: s_mul_i32 s2, s12, s4 ; GFX6-NEXT: v_readfirstlane_b32 s3, v0 ; GFX6-NEXT: s_add_i32 s2, s3, s2 @@ -8501,14 +8485,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: s_mul_i32 s13, s5, s2 -; GFX6-NEXT: v_readfirstlane_b32 s17, v2 -; GFX6-NEXT: s_add_u32 s13, s17, s13 -; GFX6-NEXT: v_readfirstlane_b32 s16, v0 +; GFX6-NEXT: v_readfirstlane_b32 s15, v2 +; GFX6-NEXT: s_add_u32 s13, s15, s13 +; GFX6-NEXT: v_readfirstlane_b32 s14, v0 ; GFX6-NEXT: s_mul_i32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: s_addc_u32 s14, 0, s14 ; GFX6-NEXT: v_readfirstlane_b32 s12, v3 ; GFX6-NEXT: s_add_u32 s3, s13, s3 -; GFX6-NEXT: s_addc_u32 s3, s16, s12 +; GFX6-NEXT: s_addc_u32 s3, s14, s12 ; GFX6-NEXT: v_readfirstlane_b32 s12, v1 ; GFX6-NEXT: s_addc_u32 s12, s12, 0 ; GFX6-NEXT: s_mul_i32 s2, s4, s2 @@ -8517,7 +8501,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_add_u32 s13, s5, s2 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 ; GFX6-NEXT: s_addc_u32 s12, s4, s12 ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 ; GFX6-NEXT: s_add_u32 s2, s10, s4 @@ -8529,72 +8512,70 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mov_b32_e32 v2, s13 ; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2 ; GFX6-NEXT: s_mul_i32 s2, s10, s12 -; GFX6-NEXT: v_readfirstlane_b32 s16, v1 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s11, v2 -; GFX6-NEXT: v_readfirstlane_b32 s17, v3 +; GFX6-NEXT: v_readfirstlane_b32 s15, v3 ; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX6-NEXT: s_add_u32 s2, s17, s2 -; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: s_add_u32 s2, s15, s2 +; GFX6-NEXT: s_addc_u32 s14, 0, s14 ; GFX6-NEXT: s_mul_i32 s13, s11, s13 -; GFX6-NEXT: v_readfirstlane_b32 s17, v1 +; GFX6-NEXT: v_readfirstlane_b32 s15, v1 ; GFX6-NEXT: s_add_u32 s2, s2, s13 -; GFX6-NEXT: s_addc_u32 s2, s16, s17 +; GFX6-NEXT: s_addc_u32 s2, s14, s15 ; GFX6-NEXT: v_readfirstlane_b32 s13, v0 ; GFX6-NEXT: s_addc_u32 s13, s13, 0 ; GFX6-NEXT: s_mul_i32 s12, s11, s12 -; GFX6-NEXT: s_add_u32 s16, s2, s12 -; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: s_add_u32 s18, s2, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s18 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: s_addc_u32 s17, 0, s13 -; GFX6-NEXT: s_mul_i32 s12, s8, s17 +; GFX6-NEXT: s_addc_u32 s19, 0, s13 +; GFX6-NEXT: s_mul_i32 s12, s8, s19 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_readfirstlane_b32 s13, v0 ; GFX6-NEXT: s_add_i32 s12, s13, s12 -; GFX6-NEXT: s_mul_i32 s13, s9, s16 -; GFX6-NEXT: s_add_i32 s18, s12, s13 -; GFX6-NEXT: s_sub_i32 s19, s11, s18 -; GFX6-NEXT: s_mul_i32 s12, s8, s16 +; GFX6-NEXT: s_mul_i32 s13, s9, s18 +; GFX6-NEXT: s_add_i32 s20, s12, s13 +; GFX6-NEXT: s_sub_i32 s14, s11, s20 +; GFX6-NEXT: s_mul_i32 s12, s8, s18 ; GFX6-NEXT: s_sub_u32 s10, s10, s12 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s20, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_subb_u32 s19, s19, s9 -; GFX6-NEXT: s_sub_u32 s21, s10, s8 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s15, s12, s13 +; GFX6-NEXT: s_subb_u32 s21, s14, s9 +; GFX6-NEXT: s_sub_u32 s22, s10, s8 +; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX6-NEXT: s_or_b32 s14, s14, s15 +; GFX6-NEXT: s_subb_u32 s14, s21, 0 +; GFX6-NEXT: s_cmp_ge_u32 s14, s9 +; GFX6-NEXT: s_cselect_b32 s15, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s22, s8 +; GFX6-NEXT: s_cselect_b32 s21, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s14, s9 +; GFX6-NEXT: s_cselect_b32 s14, s21, s15 +; GFX6-NEXT: s_add_u32 s15, s18, 1 +; GFX6-NEXT: s_addc_u32 s21, s19, 0 +; GFX6-NEXT: s_add_u32 s22, s18, 2 +; GFX6-NEXT: s_addc_u32 s23, s19, 0 +; GFX6-NEXT: s_cmp_lg_u32 s14, 0 +; GFX6-NEXT: s_cselect_b32 s14, s22, s15 +; GFX6-NEXT: s_cselect_b32 s15, s23, s21 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_subb_u32 s12, s19, 0 -; GFX6-NEXT: s_cmp_ge_u32 s12, s9 -; GFX6-NEXT: s_cselect_b32 s13, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s21, s8 -; GFX6-NEXT: s_cselect_b32 s19, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s12, s9 -; GFX6-NEXT: s_cselect_b32 s12, s19, s13 -; GFX6-NEXT: s_add_u32 s13, s16, 1 -; GFX6-NEXT: s_addc_u32 s19, s17, 0 -; GFX6-NEXT: s_add_u32 s21, s16, 2 -; GFX6-NEXT: s_addc_u32 s22, s17, 0 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_cselect_b32 s12, s21, s13 -; GFX6-NEXT: s_cselect_b32 s13, s22, s19 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_subb_u32 s11, s11, s18 +; GFX6-NEXT: s_subb_u32 s11, s11, s20 ; GFX6-NEXT: s_cmp_ge_u32 s11, s9 -; GFX6-NEXT: s_cselect_b32 s18, -1, 0 +; GFX6-NEXT: s_cselect_b32 s12, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s10, s8 ; GFX6-NEXT: s_cselect_b32 s8, -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s11, s9 -; GFX6-NEXT: s_cselect_b32 s8, s8, s18 +; GFX6-NEXT: s_cselect_b32 s8, s8, s12 ; GFX6-NEXT: s_cmp_lg_u32 s8, 0 -; GFX6-NEXT: s_cselect_b32 s9, s13, s17 -; GFX6-NEXT: s_cselect_b32 s8, s12, s16 +; GFX6-NEXT: s_cselect_b32 s9, s15, s19 +; GFX6-NEXT: s_cselect_b32 s8, s14, s18 ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5] ; GFX6-NEXT: s_sub_u32 s4, s6, s4 ; GFX6-NEXT: s_subb_u32 s5, s7, s5 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v0, s14 -; GFX6-NEXT: v_mov_b32_e32 v1, s15 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: v_mov_b32_e32 v1, s17 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8614,8 +8595,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_u32 s14, 0, s6 -; GFX9-NEXT: s_subb_u32 s15, 0, s7 +; GFX9-NEXT: s_sub_u32 s12, 0, s6 +; GFX9-NEXT: s_subb_u32 s13, 0, s7 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -8624,56 +8605,52 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s16, v1 -; GFX9-NEXT: v_readfirstlane_b32 s12, v0 -; GFX9-NEXT: s_mul_i32 s13, s14, s16 -; GFX9-NEXT: s_mul_hi_u32 s18, s14, s12 -; GFX9-NEXT: s_mul_i32 s17, s15, s12 -; GFX9-NEXT: s_add_i32 s13, s18, s13 -; GFX9-NEXT: s_mul_i32 s19, s14, s12 -; GFX9-NEXT: s_add_i32 s13, s13, s17 -; GFX9-NEXT: s_mul_hi_u32 s18, s12, s19 -; GFX9-NEXT: s_mul_i32 s20, s12, s13 -; GFX9-NEXT: s_mul_hi_u32 s17, s12, s13 +; GFX9-NEXT: v_readfirstlane_b32 s14, v1 +; GFX9-NEXT: v_readfirstlane_b32 s15, v0 +; GFX9-NEXT: s_mul_i32 s16, s12, s14 +; GFX9-NEXT: s_mul_hi_u32 s18, s12, s15 +; GFX9-NEXT: s_mul_i32 s17, s13, s15 +; GFX9-NEXT: s_add_i32 s16, s18, s16 +; GFX9-NEXT: s_mul_i32 s19, s12, s15 +; GFX9-NEXT: s_add_i32 s16, s16, s17 +; GFX9-NEXT: s_mul_hi_u32 s18, s15, s19 +; GFX9-NEXT: s_mul_i32 s20, s15, s16 +; GFX9-NEXT: s_mul_hi_u32 s17, s15, s16 ; GFX9-NEXT: s_add_u32 s18, s18, s20 ; GFX9-NEXT: s_addc_u32 s17, 0, s17 -; GFX9-NEXT: s_mul_hi_u32 s20, s16, s19 -; GFX9-NEXT: s_mul_i32 s19, s16, s19 +; GFX9-NEXT: s_mul_hi_u32 s20, s14, s19 +; GFX9-NEXT: s_mul_i32 s19, s14, s19 ; GFX9-NEXT: s_add_u32 s18, s18, s19 -; GFX9-NEXT: s_mul_hi_u32 s21, s16, s13 +; GFX9-NEXT: s_mul_hi_u32 s21, s14, s16 ; GFX9-NEXT: s_addc_u32 s17, s17, s20 ; GFX9-NEXT: s_addc_u32 s18, s21, 0 -; GFX9-NEXT: s_mul_i32 s13, s16, s13 -; GFX9-NEXT: s_add_u32 s13, s17, s13 +; GFX9-NEXT: s_mul_i32 s16, s14, s16 +; GFX9-NEXT: s_add_u32 s16, s17, s16 ; GFX9-NEXT: s_addc_u32 s17, 0, s18 -; GFX9-NEXT: s_add_u32 s18, s12, s13 -; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GFX9-NEXT: s_addc_u32 s16, s16, s17 -; GFX9-NEXT: s_mul_i32 s12, s14, s16 -; GFX9-NEXT: s_mul_hi_u32 s13, s14, s18 -; GFX9-NEXT: s_add_i32 s12, s13, s12 -; GFX9-NEXT: s_mul_i32 s15, s15, s18 -; GFX9-NEXT: s_add_i32 s12, s12, s15 -; GFX9-NEXT: s_mul_i32 s14, s14, s18 -; GFX9-NEXT: s_mul_hi_u32 s15, s16, s14 -; GFX9-NEXT: s_mul_i32 s17, s16, s14 -; GFX9-NEXT: s_mul_i32 s20, s18, s12 -; GFX9-NEXT: s_mul_hi_u32 s14, s18, s14 -; GFX9-NEXT: s_mul_hi_u32 s19, s18, s12 -; GFX9-NEXT: s_add_u32 s14, s14, s20 +; GFX9-NEXT: s_add_u32 s15, s15, s16 +; GFX9-NEXT: s_addc_u32 s14, s14, s17 +; GFX9-NEXT: s_mul_i32 s16, s12, s14 +; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15 +; GFX9-NEXT: s_add_i32 s16, s17, s16 +; GFX9-NEXT: s_mul_i32 s13, s13, s15 +; GFX9-NEXT: s_add_i32 s16, s16, s13 +; GFX9-NEXT: s_mul_i32 s12, s12, s15 +; GFX9-NEXT: s_mul_hi_u32 s17, s14, s12 +; GFX9-NEXT: s_mul_i32 s18, s14, s12 +; GFX9-NEXT: s_mul_i32 s20, s15, s16 +; GFX9-NEXT: s_mul_hi_u32 s12, s15, s12 +; GFX9-NEXT: s_mul_hi_u32 s19, s15, s16 +; GFX9-NEXT: s_add_u32 s12, s12, s20 ; GFX9-NEXT: s_addc_u32 s19, 0, s19 -; GFX9-NEXT: s_add_u32 s14, s14, s17 -; GFX9-NEXT: s_mul_hi_u32 s13, s16, s12 -; GFX9-NEXT: s_addc_u32 s14, s19, s15 +; GFX9-NEXT: s_add_u32 s12, s12, s18 +; GFX9-NEXT: s_mul_hi_u32 s13, s14, s16 +; GFX9-NEXT: s_addc_u32 s12, s19, s17 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NEXT: s_mul_i32 s12, s16, s12 -; GFX9-NEXT: s_add_u32 s12, s14, s12 -; GFX9-NEXT: s_addc_u32 s14, 0, s13 -; GFX9-NEXT: s_add_u32 s15, s18, s12 -; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GFX9-NEXT: s_addc_u32 s14, s16, s14 +; GFX9-NEXT: s_mul_i32 s16, s14, s16 +; GFX9-NEXT: s_add_u32 s12, s12, s16 +; GFX9-NEXT: s_addc_u32 s13, 0, s13 +; GFX9-NEXT: s_add_u32 s15, s15, s12 +; GFX9-NEXT: s_addc_u32 s14, s14, s13 ; GFX9-NEXT: s_ashr_i32 s12, s9, 31 ; GFX9-NEXT: s_add_u32 s8, s8, s12 ; GFX9-NEXT: s_mov_b32 s13, s12 @@ -8691,38 +8668,35 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_addc_u32 s15, s16, s19 ; GFX9-NEXT: s_addc_u32 s16, s18, 0 ; GFX9-NEXT: s_mul_i32 s14, s9, s14 -; GFX9-NEXT: s_add_u32 s18, s15, s14 -; GFX9-NEXT: s_addc_u32 s19, 0, s16 -; GFX9-NEXT: s_mul_i32 s14, s6, s19 -; GFX9-NEXT: s_mul_hi_u32 s15, s6, s18 +; GFX9-NEXT: s_add_u32 s17, s15, s14 +; GFX9-NEXT: s_addc_u32 s16, 0, s16 +; GFX9-NEXT: s_mul_i32 s14, s6, s16 +; GFX9-NEXT: s_mul_hi_u32 s15, s6, s17 ; GFX9-NEXT: s_add_i32 s14, s15, s14 -; GFX9-NEXT: s_mul_i32 s15, s7, s18 -; GFX9-NEXT: s_add_i32 s20, s14, s15 -; GFX9-NEXT: s_sub_i32 s16, s9, s20 -; GFX9-NEXT: s_mul_i32 s14, s6, s18 +; GFX9-NEXT: s_mul_i32 s15, s7, s17 +; GFX9-NEXT: s_add_i32 s18, s14, s15 +; GFX9-NEXT: s_sub_i32 s19, s9, s18 +; GFX9-NEXT: s_mul_i32 s14, s6, s17 ; GFX9-NEXT: s_sub_u32 s8, s8, s14 ; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GFX9-NEXT: s_subb_u32 s21, s16, s7 -; GFX9-NEXT: s_sub_u32 s22, s8, s6 -; GFX9-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[16:17], 0 -; GFX9-NEXT: s_subb_u32 s16, s21, 0 -; GFX9-NEXT: s_cmp_ge_u32 s16, s7 -; GFX9-NEXT: s_cselect_b32 s17, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s22, s6 +; GFX9-NEXT: s_subb_u32 s19, s19, s7 +; GFX9-NEXT: s_sub_u32 s20, s8, s6 +; GFX9-NEXT: s_subb_u32 s19, s19, 0 +; GFX9-NEXT: s_cmp_ge_u32 s19, s7 ; GFX9-NEXT: s_cselect_b32 s21, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s16, s7 -; GFX9-NEXT: s_cselect_b32 s16, s21, s17 -; GFX9-NEXT: s_add_u32 s17, s18, 1 -; GFX9-NEXT: s_addc_u32 s21, s19, 0 -; GFX9-NEXT: s_add_u32 s22, s18, 2 -; GFX9-NEXT: s_addc_u32 s23, s19, 0 -; GFX9-NEXT: s_cmp_lg_u32 s16, 0 -; GFX9-NEXT: s_cselect_b32 s16, s22, s17 -; GFX9-NEXT: s_cselect_b32 s17, s23, s21 +; GFX9-NEXT: s_cmp_ge_u32 s20, s6 +; GFX9-NEXT: s_cselect_b32 s20, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s19, s7 +; GFX9-NEXT: s_cselect_b32 s19, s20, s21 +; GFX9-NEXT: s_add_u32 s20, s17, 1 +; GFX9-NEXT: s_addc_u32 s21, s16, 0 +; GFX9-NEXT: s_add_u32 s22, s17, 2 +; GFX9-NEXT: s_addc_u32 s23, s16, 0 +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cselect_b32 s19, s22, s20 +; GFX9-NEXT: s_cselect_b32 s20, s23, s21 ; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GFX9-NEXT: s_subb_u32 s9, s9, s20 +; GFX9-NEXT: s_subb_u32 s9, s9, s18 ; GFX9-NEXT: s_cmp_ge_u32 s9, s7 ; GFX9-NEXT: s_cselect_b32 s14, -1, 0 ; GFX9-NEXT: s_cmp_ge_u32 s8, s6 @@ -8730,12 +8704,12 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cmp_eq_u32 s9, s7 ; GFX9-NEXT: s_cselect_b32 s6, s6, s14 ; GFX9-NEXT: s_cmp_lg_u32 s6, 0 -; GFX9-NEXT: s_cselect_b32 s7, s17, s19 -; GFX9-NEXT: s_cselect_b32 s6, s16, s18 +; GFX9-NEXT: s_cselect_b32 s7, s20, s16 +; GFX9-NEXT: s_cselect_b32 s6, s19, s17 ; GFX9-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3] ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] -; GFX9-NEXT: s_sub_u32 s14, s6, s2 -; GFX9-NEXT: s_subb_u32 s15, s7, s3 +; GFX9-NEXT: s_sub_u32 s12, s6, s2 +; GFX9-NEXT: s_subb_u32 s13, s7, s3 ; GFX9-NEXT: s_ashr_i32 s2, s1, 31 ; GFX9-NEXT: s_add_u32 s0, s0, s2 ; GFX9-NEXT: s_mov_b32 s3, s2 @@ -8744,8 +8718,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_sub_u32 s8, 0, s6 -; GFX9-NEXT: s_subb_u32 s9, 0, s7 +; GFX9-NEXT: s_sub_u32 s4, 0, s6 +; GFX9-NEXT: s_subb_u32 s5, 0, s7 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -8755,105 +8729,98 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: v_readfirstlane_b32 s13, v2 -; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4 -; GFX9-NEXT: s_mul_i32 s16, s8, s13 -; GFX9-NEXT: s_mul_i32 s5, s9, s4 -; GFX9-NEXT: s_add_i32 s12, s12, s16 -; GFX9-NEXT: s_add_i32 s12, s12, s5 -; GFX9-NEXT: s_mul_i32 s17, s8, s4 -; GFX9-NEXT: s_mul_i32 s16, s4, s12 -; GFX9-NEXT: s_mul_hi_u32 s18, s4, s17 -; GFX9-NEXT: s_mul_hi_u32 s5, s4, s12 +; GFX9-NEXT: v_readfirstlane_b32 s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s15, v2 +; GFX9-NEXT: s_mul_hi_u32 s14, s4, s8 +; GFX9-NEXT: s_mul_i32 s16, s4, s15 +; GFX9-NEXT: s_mul_i32 s9, s5, s8 +; GFX9-NEXT: s_add_i32 s14, s14, s16 +; GFX9-NEXT: s_add_i32 s14, s14, s9 +; GFX9-NEXT: s_mul_i32 s17, s4, s8 +; GFX9-NEXT: s_mul_i32 s16, s8, s14 +; GFX9-NEXT: s_mul_hi_u32 s18, s8, s17 +; GFX9-NEXT: s_mul_hi_u32 s9, s8, s14 ; GFX9-NEXT: s_add_u32 s16, s18, s16 -; GFX9-NEXT: s_addc_u32 s5, 0, s5 -; GFX9-NEXT: s_mul_hi_u32 s19, s13, s17 -; GFX9-NEXT: s_mul_i32 s17, s13, s17 +; GFX9-NEXT: s_addc_u32 s9, 0, s9 +; GFX9-NEXT: s_mul_hi_u32 s19, s15, s17 +; GFX9-NEXT: s_mul_i32 s17, s15, s17 ; GFX9-NEXT: s_add_u32 s16, s16, s17 -; GFX9-NEXT: s_mul_hi_u32 s18, s13, s12 -; GFX9-NEXT: s_addc_u32 s5, s5, s19 +; GFX9-NEXT: s_mul_hi_u32 s18, s15, s14 +; GFX9-NEXT: s_addc_u32 s9, s9, s19 ; GFX9-NEXT: s_addc_u32 s16, s18, 0 -; GFX9-NEXT: s_mul_i32 s12, s13, s12 -; GFX9-NEXT: s_add_u32 s5, s5, s12 -; GFX9-NEXT: s_addc_u32 s12, 0, s16 -; GFX9-NEXT: s_add_u32 s16, s4, s5 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s12, s13, s12 -; GFX9-NEXT: s_mul_i32 s4, s8, s12 -; GFX9-NEXT: s_mul_hi_u32 s5, s8, s16 -; GFX9-NEXT: s_add_i32 s4, s5, s4 -; GFX9-NEXT: s_mul_i32 s9, s9, s16 -; GFX9-NEXT: s_add_i32 s4, s4, s9 -; GFX9-NEXT: s_mul_i32 s8, s8, s16 -; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8 -; GFX9-NEXT: s_mul_i32 s13, s12, s8 -; GFX9-NEXT: s_mul_i32 s18, s16, s4 -; GFX9-NEXT: s_mul_hi_u32 s8, s16, s8 -; GFX9-NEXT: s_mul_hi_u32 s17, s16, s4 -; GFX9-NEXT: s_add_u32 s8, s8, s18 +; GFX9-NEXT: s_mul_i32 s14, s15, s14 +; GFX9-NEXT: s_add_u32 s9, s9, s14 +; GFX9-NEXT: s_addc_u32 s14, 0, s16 +; GFX9-NEXT: s_add_u32 s8, s8, s9 +; GFX9-NEXT: s_addc_u32 s9, s15, s14 +; GFX9-NEXT: s_mul_i32 s14, s4, s9 +; GFX9-NEXT: s_mul_hi_u32 s15, s4, s8 +; GFX9-NEXT: s_add_i32 s14, s15, s14 +; GFX9-NEXT: s_mul_i32 s5, s5, s8 +; GFX9-NEXT: s_add_i32 s14, s14, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s15, s9, s4 +; GFX9-NEXT: s_mul_i32 s16, s9, s4 +; GFX9-NEXT: s_mul_i32 s18, s8, s14 +; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 +; GFX9-NEXT: s_mul_hi_u32 s17, s8, s14 +; GFX9-NEXT: s_add_u32 s4, s4, s18 ; GFX9-NEXT: s_addc_u32 s17, 0, s17 -; GFX9-NEXT: s_add_u32 s8, s8, s13 -; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4 -; GFX9-NEXT: s_addc_u32 s8, s17, s9 +; GFX9-NEXT: s_add_u32 s4, s4, s16 +; GFX9-NEXT: s_mul_hi_u32 s5, s9, s14 +; GFX9-NEXT: s_addc_u32 s4, s17, s15 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_mul_i32 s4, s12, s4 -; GFX9-NEXT: s_add_u32 s4, s8, s4 -; GFX9-NEXT: s_addc_u32 s8, 0, s5 -; GFX9-NEXT: s_add_u32 s13, s16, s4 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s12, s12, s8 +; GFX9-NEXT: s_mul_i32 s14, s9, s14 +; GFX9-NEXT: s_add_u32 s4, s4, s14 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_add_u32 s14, s8, s4 +; GFX9-NEXT: s_addc_u32 s15, s9, s5 ; GFX9-NEXT: s_ashr_i32 s4, s11, 31 ; GFX9-NEXT: s_add_u32 s8, s10, s4 ; GFX9-NEXT: s_mov_b32 s5, s4 ; GFX9-NEXT: s_addc_u32 s9, s11, s4 ; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5] -; GFX9-NEXT: s_mul_i32 s11, s8, s12 -; GFX9-NEXT: s_mul_hi_u32 s16, s8, s13 -; GFX9-NEXT: s_mul_hi_u32 s10, s8, s12 +; GFX9-NEXT: s_mul_i32 s11, s8, s15 +; GFX9-NEXT: s_mul_hi_u32 s16, s8, s14 +; GFX9-NEXT: s_mul_hi_u32 s10, s8, s15 ; GFX9-NEXT: s_add_u32 s11, s16, s11 ; GFX9-NEXT: s_addc_u32 s10, 0, s10 -; GFX9-NEXT: s_mul_hi_u32 s17, s9, s13 -; GFX9-NEXT: s_mul_i32 s13, s9, s13 -; GFX9-NEXT: s_add_u32 s11, s11, s13 -; GFX9-NEXT: s_mul_hi_u32 s16, s9, s12 +; GFX9-NEXT: s_mul_hi_u32 s17, s9, s14 +; GFX9-NEXT: s_mul_i32 s14, s9, s14 +; GFX9-NEXT: s_add_u32 s11, s11, s14 +; GFX9-NEXT: s_mul_hi_u32 s16, s9, s15 ; GFX9-NEXT: s_addc_u32 s10, s10, s17 ; GFX9-NEXT: s_addc_u32 s11, s16, 0 -; GFX9-NEXT: s_mul_i32 s12, s9, s12 -; GFX9-NEXT: s_add_u32 s16, s10, s12 -; GFX9-NEXT: s_addc_u32 s17, 0, s11 -; GFX9-NEXT: s_mul_i32 s10, s6, s17 -; GFX9-NEXT: s_mul_hi_u32 s11, s6, s16 +; GFX9-NEXT: s_mul_i32 s14, s9, s15 +; GFX9-NEXT: s_add_u32 s14, s10, s14 +; GFX9-NEXT: s_addc_u32 s15, 0, s11 +; GFX9-NEXT: s_mul_i32 s10, s6, s15 +; GFX9-NEXT: s_mul_hi_u32 s11, s6, s14 ; GFX9-NEXT: s_add_i32 s10, s11, s10 -; GFX9-NEXT: s_mul_i32 s11, s7, s16 -; GFX9-NEXT: s_add_i32 s18, s10, s11 -; GFX9-NEXT: s_sub_i32 s12, s9, s18 -; GFX9-NEXT: s_mul_i32 s10, s6, s16 +; GFX9-NEXT: s_mul_i32 s11, s7, s14 +; GFX9-NEXT: s_add_i32 s16, s10, s11 +; GFX9-NEXT: s_sub_i32 s17, s9, s16 +; GFX9-NEXT: s_mul_i32 s10, s6, s14 ; GFX9-NEXT: s_sub_u32 s8, s8, s10 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s19, s12, s7 -; GFX9-NEXT: s_sub_u32 s20, s8, s6 -; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GFX9-NEXT: s_subb_u32 s12, s19, 0 -; GFX9-NEXT: s_cmp_ge_u32 s12, s7 -; GFX9-NEXT: s_cselect_b32 s13, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s20, s6 +; GFX9-NEXT: s_subb_u32 s17, s17, s7 +; GFX9-NEXT: s_sub_u32 s18, s8, s6 +; GFX9-NEXT: s_subb_u32 s17, s17, 0 +; GFX9-NEXT: s_cmp_ge_u32 s17, s7 ; GFX9-NEXT: s_cselect_b32 s19, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s12, s7 -; GFX9-NEXT: s_cselect_b32 s12, s19, s13 -; GFX9-NEXT: s_add_u32 s13, s16, 1 -; GFX9-NEXT: s_addc_u32 s19, s17, 0 -; GFX9-NEXT: s_add_u32 s20, s16, 2 -; GFX9-NEXT: s_addc_u32 s21, s17, 0 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 -; GFX9-NEXT: s_cselect_b32 s12, s20, s13 -; GFX9-NEXT: s_cselect_b32 s13, s21, s19 +; GFX9-NEXT: s_cmp_ge_u32 s18, s6 +; GFX9-NEXT: s_cselect_b32 s18, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s17, s7 +; GFX9-NEXT: s_cselect_b32 s17, s18, s19 +; GFX9-NEXT: s_add_u32 s18, s14, 1 +; GFX9-NEXT: s_addc_u32 s19, s15, 0 +; GFX9-NEXT: s_add_u32 s20, s14, 2 +; GFX9-NEXT: s_addc_u32 s21, s15, 0 +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cselect_b32 s17, s20, s18 +; GFX9-NEXT: s_cselect_b32 s18, s21, s19 ; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s9, s9, s18 +; GFX9-NEXT: s_subb_u32 s9, s9, s16 ; GFX9-NEXT: s_cmp_ge_u32 s9, s7 ; GFX9-NEXT: s_cselect_b32 s10, -1, 0 ; GFX9-NEXT: s_cmp_ge_u32 s8, s6 @@ -8861,14 +8828,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cmp_eq_u32 s9, s7 ; GFX9-NEXT: s_cselect_b32 s6, s6, s10 ; GFX9-NEXT: s_cmp_lg_u32 s6, 0 -; GFX9-NEXT: s_cselect_b32 s7, s13, s17 -; GFX9-NEXT: s_cselect_b32 s6, s12, s16 +; GFX9-NEXT: s_cselect_b32 s7, s18, s15 +; GFX9-NEXT: s_cselect_b32 s6, s17, s14 ; GFX9-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] ; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], s[2:3] ; GFX9-NEXT: s_sub_u32 s2, s4, s2 ; GFX9-NEXT: s_subb_u32 s3, s5, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -9089,10 +9056,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_addc_u32 s13, 0, s14 ; GFX6-NEXT: s_add_u32 s14, s0, s1 ; GFX6-NEXT: v_mov_b32_e32 v0, s14 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 +; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_addc_u32 s12, s12, s13 ; GFX6-NEXT: s_mul_i32 s0, s10, s12 ; GFX6-NEXT: v_readfirstlane_b32 s1, v0 @@ -9123,7 +9089,6 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_add_u32 s13, s14, s0 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_addc_u32 s12, s12, s10 ; GFX6-NEXT: s_ashr_i32 s10, s7, 31 ; GFX6-NEXT: s_add_u32 s0, s6, s10 @@ -9158,46 +9123,43 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: v_readfirstlane_b32 s5, v0 ; GFX6-NEXT: s_add_i32 s4, s5, s4 ; GFX6-NEXT: s_mul_i32 s5, s9, s12 -; GFX6-NEXT: s_add_i32 s13, s4, s5 -; GFX6-NEXT: s_sub_i32 s14, s7, s13 +; GFX6-NEXT: s_add_i32 s14, s4, s5 +; GFX6-NEXT: s_sub_i32 s13, s7, s14 ; GFX6-NEXT: s_mul_i32 s4, s8, s12 ; GFX6-NEXT: s_sub_u32 s6, s6, s4 ; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX6-NEXT: s_or_b32 s12, s4, s5 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_subb_u32 s14, s14, s9 -; GFX6-NEXT: s_sub_u32 s15, s6, s8 -; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX6-NEXT: s_subb_u32 s15, s13, s9 +; GFX6-NEXT: s_sub_u32 s16, s6, s8 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s17, s12, s13 +; GFX6-NEXT: s_subb_u32 s17, s15, 0 +; GFX6-NEXT: s_cmp_ge_u32 s17, s9 +; GFX6-NEXT: s_cselect_b32 s18, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s16, s8 +; GFX6-NEXT: s_cselect_b32 s19, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s17, s9 +; GFX6-NEXT: s_cselect_b32 s18, s19, s18 +; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_subb_u32 s15, s15, s9 +; GFX6-NEXT: s_sub_u32 s19, s16, s8 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_subb_u32 s12, s15, 0 +; GFX6-NEXT: s_cmp_lg_u32 s18, 0 +; GFX6-NEXT: s_cselect_b32 s13, s19, s16 +; GFX6-NEXT: s_cselect_b32 s12, s12, s17 ; GFX6-NEXT: s_or_b32 s4, s4, s5 -; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_subb_u32 s16, s14, 0 -; GFX6-NEXT: s_cmp_ge_u32 s16, s9 +; GFX6-NEXT: s_subb_u32 s4, s7, s14 +; GFX6-NEXT: s_cmp_ge_u32 s4, s9 ; GFX6-NEXT: s_cselect_b32 s5, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s15, s8 -; GFX6-NEXT: s_cselect_b32 s17, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s16, s9 -; GFX6-NEXT: s_cselect_b32 s17, s17, s5 -; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_subb_u32 s14, s14, s9 -; GFX6-NEXT: s_sub_u32 s18, s15, s8 -; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX6-NEXT: s_or_b32 s4, s4, s5 -; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_subb_u32 s4, s14, 0 -; GFX6-NEXT: s_cmp_lg_u32 s17, 0 -; GFX6-NEXT: s_cselect_b32 s14, s18, s15 -; GFX6-NEXT: s_cselect_b32 s4, s4, s16 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_subb_u32 s5, s7, s13 -; GFX6-NEXT: s_cmp_ge_u32 s5, s9 -; GFX6-NEXT: s_cselect_b32 s7, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s6, s8 -; GFX6-NEXT: s_cselect_b32 s8, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s5, s9 -; GFX6-NEXT: s_cselect_b32 s7, s8, s7 -; GFX6-NEXT: s_cmp_lg_u32 s7, 0 -; GFX6-NEXT: s_cselect_b32 s5, s4, s5 -; GFX6-NEXT: s_cselect_b32 s4, s14, s6 +; GFX6-NEXT: s_cselect_b32 s7, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s4, s9 +; GFX6-NEXT: s_cselect_b32 s5, s7, s5 +; GFX6-NEXT: s_cmp_lg_u32 s5, 0 +; GFX6-NEXT: s_cselect_b32 s5, s12, s4 +; GFX6-NEXT: s_cselect_b32 s4, s13, s6 ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11] ; GFX6-NEXT: s_sub_u32 s4, s4, s10 ; GFX6-NEXT: s_subb_u32 s5, s5, s10 @@ -9219,8 +9181,8 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_sub_u32 s8, 0, s6 -; GFX9-NEXT: s_subb_u32 s9, 0, s7 +; GFX9-NEXT: s_sub_u32 s4, 0, s6 +; GFX9-NEXT: s_subb_u32 s5, 0, s7 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -9230,56 +9192,52 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s10, v2 -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_mul_i32 s5, s8, s10 -; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4 -; GFX9-NEXT: s_mul_i32 s11, s9, s4 -; GFX9-NEXT: s_add_i32 s5, s12, s5 -; GFX9-NEXT: s_mul_i32 s13, s8, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s11 -; GFX9-NEXT: s_mul_hi_u32 s12, s4, s13 -; GFX9-NEXT: s_mul_i32 s14, s4, s5 -; GFX9-NEXT: s_mul_hi_u32 s11, s4, s5 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v1 +; GFX9-NEXT: s_mul_i32 s10, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s12, s4, s9 +; GFX9-NEXT: s_mul_i32 s11, s5, s9 +; GFX9-NEXT: s_add_i32 s10, s12, s10 +; GFX9-NEXT: s_mul_i32 s13, s4, s9 +; GFX9-NEXT: s_add_i32 s10, s10, s11 +; GFX9-NEXT: s_mul_hi_u32 s12, s9, s13 +; GFX9-NEXT: s_mul_i32 s14, s9, s10 +; GFX9-NEXT: s_mul_hi_u32 s11, s9, s10 ; GFX9-NEXT: s_add_u32 s12, s12, s14 ; GFX9-NEXT: s_addc_u32 s11, 0, s11 -; GFX9-NEXT: s_mul_hi_u32 s15, s10, s13 -; GFX9-NEXT: s_mul_i32 s13, s10, s13 +; GFX9-NEXT: s_mul_hi_u32 s15, s8, s13 +; GFX9-NEXT: s_mul_i32 s13, s8, s13 ; GFX9-NEXT: s_add_u32 s12, s12, s13 -; GFX9-NEXT: s_mul_hi_u32 s14, s10, s5 +; GFX9-NEXT: s_mul_hi_u32 s14, s8, s10 ; GFX9-NEXT: s_addc_u32 s11, s11, s15 ; GFX9-NEXT: s_addc_u32 s12, s14, 0 -; GFX9-NEXT: s_mul_i32 s5, s10, s5 -; GFX9-NEXT: s_add_u32 s5, s11, s5 +; GFX9-NEXT: s_mul_i32 s10, s8, s10 +; GFX9-NEXT: s_add_u32 s10, s11, s10 ; GFX9-NEXT: s_addc_u32 s11, 0, s12 -; GFX9-NEXT: s_add_u32 s12, s4, s5 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s10, s10, s11 -; GFX9-NEXT: s_mul_i32 s4, s8, s10 -; GFX9-NEXT: s_mul_hi_u32 s5, s8, s12 -; GFX9-NEXT: s_add_i32 s4, s5, s4 -; GFX9-NEXT: s_mul_i32 s9, s9, s12 -; GFX9-NEXT: s_add_i32 s4, s4, s9 -; GFX9-NEXT: s_mul_i32 s8, s8, s12 -; GFX9-NEXT: s_mul_hi_u32 s9, s10, s8 -; GFX9-NEXT: s_mul_i32 s11, s10, s8 -; GFX9-NEXT: s_mul_i32 s14, s12, s4 -; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8 -; GFX9-NEXT: s_mul_hi_u32 s13, s12, s4 -; GFX9-NEXT: s_add_u32 s8, s8, s14 +; GFX9-NEXT: s_add_u32 s9, s9, s10 +; GFX9-NEXT: s_addc_u32 s8, s8, s11 +; GFX9-NEXT: s_mul_i32 s10, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s11, s4, s9 +; GFX9-NEXT: s_add_i32 s10, s11, s10 +; GFX9-NEXT: s_mul_i32 s5, s5, s9 +; GFX9-NEXT: s_add_i32 s10, s10, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s9 +; GFX9-NEXT: s_mul_hi_u32 s11, s8, s4 +; GFX9-NEXT: s_mul_i32 s12, s8, s4 +; GFX9-NEXT: s_mul_i32 s14, s9, s10 +; GFX9-NEXT: s_mul_hi_u32 s4, s9, s4 +; GFX9-NEXT: s_mul_hi_u32 s13, s9, s10 +; GFX9-NEXT: s_add_u32 s4, s4, s14 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_add_u32 s8, s8, s11 -; GFX9-NEXT: s_mul_hi_u32 s5, s10, s4 -; GFX9-NEXT: s_addc_u32 s8, s13, s9 +; GFX9-NEXT: s_add_u32 s4, s4, s12 +; GFX9-NEXT: s_mul_hi_u32 s5, s8, s10 +; GFX9-NEXT: s_addc_u32 s4, s13, s11 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_mul_i32 s4, s10, s4 -; GFX9-NEXT: s_add_u32 s4, s8, s4 -; GFX9-NEXT: s_addc_u32 s8, 0, s5 -; GFX9-NEXT: s_add_u32 s9, s12, s4 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s8, s10, s8 +; GFX9-NEXT: s_mul_i32 s10, s8, s10 +; GFX9-NEXT: s_add_u32 s4, s4, s10 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_add_u32 s9, s9, s4 +; GFX9-NEXT: s_addc_u32 s8, s8, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: s_add_u32 s2, s2, s4 @@ -9309,11 +9267,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_mul_i32 s8, s6, s8 ; GFX9-NEXT: s_sub_u32 s2, s2, s8 ; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_subb_u32 s13, s10, s7 ; GFX9-NEXT: s_sub_u32 s14, s2, s6 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 ; GFX9-NEXT: s_subb_u32 s15, s13, 0 ; GFX9-NEXT: s_cmp_ge_u32 s15, s7 ; GFX9-NEXT: s_cselect_b32 s16, -1, 0 @@ -9322,13 +9278,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_cmp_eq_u32 s15, s7 ; GFX9-NEXT: s_cselect_b32 s16, s17, s16 ; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s13, s13, s7 -; GFX9-NEXT: s_sub_u32 s17, s14, s6 -; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s10, s13, 0 +; GFX9-NEXT: s_subb_u32 s10, s13, s7 +; GFX9-NEXT: s_sub_u32 s11, s14, s6 +; GFX9-NEXT: s_subb_u32 s10, s10, 0 ; GFX9-NEXT: s_cmp_lg_u32 s16, 0 -; GFX9-NEXT: s_cselect_b32 s11, s17, s14 +; GFX9-NEXT: s_cselect_b32 s11, s11, s14 ; GFX9-NEXT: s_cselect_b32 s10, s10, s15 ; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_subb_u32 s3, s3, s12 @@ -9490,10 +9444,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_addc_u32 s15, 0, s16 ; GFX6-NEXT: s_add_u32 s16, s6, s7 ; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX6-NEXT: s_or_b32 s6, s6, s7 -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 ; GFX6-NEXT: s_addc_u32 s14, s14, s15 ; GFX6-NEXT: s_mul_i32 s6, s12, s14 ; GFX6-NEXT: v_readfirstlane_b32 s7, v0 @@ -9524,7 +9477,6 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_add_u32 s13, s16, s6 ; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX6-NEXT: s_or_b32 s6, s6, s7 -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 ; GFX6-NEXT: s_addc_u32 s12, s14, s12 ; GFX6-NEXT: s_ashr_i32 s6, s9, 31 ; GFX6-NEXT: s_add_u32 s8, s8, s6 @@ -9557,49 +9509,46 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_readfirstlane_b32 s14, v0 ; GFX6-NEXT: s_add_i32 s13, s14, s13 ; GFX6-NEXT: s_mul_i32 s14, s3, s12 -; GFX6-NEXT: s_add_i32 s14, s13, s14 -; GFX6-NEXT: s_sub_i32 s15, s9, s14 +; GFX6-NEXT: s_add_i32 s16, s13, s14 +; GFX6-NEXT: s_sub_i32 s14, s9, s16 ; GFX6-NEXT: s_mul_i32 s12, s2, s12 ; GFX6-NEXT: s_sub_u32 s8, s8, s12 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s16, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_subb_u32 s15, s15, s3 -; GFX6-NEXT: s_sub_u32 s17, s8, s2 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_subb_u32 s18, s15, 0 -; GFX6-NEXT: s_cmp_ge_u32 s18, s3 -; GFX6-NEXT: s_cselect_b32 s13, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s17, s2 -; GFX6-NEXT: s_cselect_b32 s19, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s18, s3 -; GFX6-NEXT: s_cselect_b32 s19, s19, s13 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_subb_u32 s15, s15, s3 -; GFX6-NEXT: s_sub_u32 s20, s17, s2 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s15, s12, s13 +; GFX6-NEXT: s_subb_u32 s17, s14, s3 +; GFX6-NEXT: s_sub_u32 s18, s8, s2 +; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX6-NEXT: s_or_b32 s19, s14, s15 +; GFX6-NEXT: s_subb_u32 s19, s17, 0 +; GFX6-NEXT: s_cmp_ge_u32 s19, s3 +; GFX6-NEXT: s_cselect_b32 s20, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s18, s2 +; GFX6-NEXT: s_cselect_b32 s21, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s19, s3 +; GFX6-NEXT: s_cselect_b32 s20, s21, s20 +; GFX6-NEXT: s_or_b32 s14, s14, s15 +; GFX6-NEXT: s_subb_u32 s17, s17, s3 +; GFX6-NEXT: s_sub_u32 s21, s18, s2 +; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX6-NEXT: s_or_b32 s14, s14, s15 +; GFX6-NEXT: s_subb_u32 s14, s17, 0 +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_cselect_b32 s15, s21, s18 +; GFX6-NEXT: s_cselect_b32 s14, s14, s19 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_subb_u32 s12, s15, 0 -; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: s_cselect_b32 s13, s20, s17 -; GFX6-NEXT: s_cselect_b32 s12, s12, s18 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_subb_u32 s9, s9, s14 +; GFX6-NEXT: s_subb_u32 s9, s9, s16 ; GFX6-NEXT: s_cmp_ge_u32 s9, s3 -; GFX6-NEXT: s_cselect_b32 s14, -1, 0 +; GFX6-NEXT: s_cselect_b32 s12, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s8, s2 ; GFX6-NEXT: s_cselect_b32 s2, -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s9, s3 -; GFX6-NEXT: s_cselect_b32 s2, s2, s14 +; GFX6-NEXT: s_cselect_b32 s2, s2, s12 ; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_cselect_b32 s3, s12, s9 -; GFX6-NEXT: s_cselect_b32 s2, s13, s8 +; GFX6-NEXT: s_cselect_b32 s3, s14, s9 +; GFX6-NEXT: s_cselect_b32 s2, s15, s8 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] -; GFX6-NEXT: s_sub_u32 s12, s2, s6 -; GFX6-NEXT: s_subb_u32 s13, s3, s6 +; GFX6-NEXT: s_sub_u32 s14, s2, s6 +; GFX6-NEXT: s_subb_u32 s15, s3, s6 ; GFX6-NEXT: s_ashr_i32 s2, s1, 31 ; GFX6-NEXT: s_add_u32 s0, s0, s2 ; GFX6-NEXT: s_mov_b32 s3, s2 @@ -9618,40 +9567,39 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s1, s8, s14 +; GFX6-NEXT: s_mul_i32 s1, s8, s12 ; GFX6-NEXT: v_readfirstlane_b32 s3, v2 ; GFX6-NEXT: s_mul_i32 s0, s9, s2 ; GFX6-NEXT: s_add_i32 s1, s3, s1 ; GFX6-NEXT: s_add_i32 s3, s1, s0 -; GFX6-NEXT: s_mul_i32 s15, s8, s2 +; GFX6-NEXT: s_mul_i32 s13, s8, s2 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mul_i32 s4, s2, s3 ; GFX6-NEXT: v_readfirstlane_b32 s5, v2 ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, s13 ; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3 ; GFX6-NEXT: s_add_u32 s4, s16, s4 ; GFX6-NEXT: s_addc_u32 s5, 0, s5 -; GFX6-NEXT: s_mul_i32 s15, s14, s15 +; GFX6-NEXT: s_mul_i32 s13, s12, s13 ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 -; GFX6-NEXT: s_add_u32 s4, s4, s15 +; GFX6-NEXT: s_add_u32 s4, s4, s13 ; GFX6-NEXT: s_addc_u32 s4, s5, s16 ; GFX6-NEXT: v_readfirstlane_b32 s5, v1 ; GFX6-NEXT: s_addc_u32 s5, s5, 0 -; GFX6-NEXT: s_mul_i32 s3, s14, s3 +; GFX6-NEXT: s_mul_i32 s3, s12, s3 ; GFX6-NEXT: s_add_u32 s3, s4, s3 ; GFX6-NEXT: s_addc_u32 s4, 0, s5 ; GFX6-NEXT: s_add_u32 s5, s2, s3 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_addc_u32 s4, s14, s4 +; GFX6-NEXT: s_addc_u32 s4, s12, s4 ; GFX6-NEXT: s_mul_i32 s2, s8, s4 ; GFX6-NEXT: v_readfirstlane_b32 s3, v0 ; GFX6-NEXT: s_add_i32 s2, s3, s2 @@ -9665,102 +9613,98 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: s_mul_i32 s9, s5, s2 -; GFX6-NEXT: v_readfirstlane_b32 s15, v2 -; GFX6-NEXT: s_add_u32 s9, s15, s9 -; GFX6-NEXT: v_readfirstlane_b32 s14, v0 +; GFX6-NEXT: v_readfirstlane_b32 s13, v2 +; GFX6-NEXT: s_add_u32 s9, s13, s9 +; GFX6-NEXT: v_readfirstlane_b32 s12, v0 ; GFX6-NEXT: s_mul_i32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s14, 0, s14 +; GFX6-NEXT: s_addc_u32 s12, 0, s12 ; GFX6-NEXT: v_readfirstlane_b32 s8, v3 ; GFX6-NEXT: s_add_u32 s3, s9, s3 -; GFX6-NEXT: s_addc_u32 s3, s14, s8 +; GFX6-NEXT: s_addc_u32 s3, s12, s8 ; GFX6-NEXT: v_readfirstlane_b32 s8, v1 ; GFX6-NEXT: s_addc_u32 s8, s8, 0 ; GFX6-NEXT: s_mul_i32 s2, s4, s2 ; GFX6-NEXT: s_add_u32 s2, s3, s2 ; GFX6-NEXT: s_addc_u32 s8, 0, s8 -; GFX6-NEXT: s_add_u32 s14, s5, s2 +; GFX6-NEXT: s_add_u32 s12, s5, s2 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_addc_u32 s15, s4, s8 +; GFX6-NEXT: s_addc_u32 s13, s4, s8 ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 ; GFX6-NEXT: s_add_u32 s2, s10, s4 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_addc_u32 s3, s11, s4 ; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, s15 +; GFX6-NEXT: v_mov_b32_e32 v0, s13 ; GFX6-NEXT: v_mul_hi_u32 v1, s8, v0 -; GFX6-NEXT: v_mov_b32_e32 v2, s14 +; GFX6-NEXT: v_mov_b32_e32 v2, s12 ; GFX6-NEXT: v_mul_hi_u32 v3, s8, v2 -; GFX6-NEXT: s_mul_i32 s2, s8, s15 +; GFX6-NEXT: s_mul_i32 s2, s8, s13 ; GFX6-NEXT: v_readfirstlane_b32 s10, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s9, v2 ; GFX6-NEXT: v_readfirstlane_b32 s11, v3 ; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 ; GFX6-NEXT: s_add_u32 s2, s11, s2 ; GFX6-NEXT: s_addc_u32 s10, 0, s10 -; GFX6-NEXT: s_mul_i32 s11, s9, s14 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: s_mul_i32 s11, s9, s12 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 ; GFX6-NEXT: s_add_u32 s2, s2, s11 -; GFX6-NEXT: s_addc_u32 s2, s10, s14 +; GFX6-NEXT: s_addc_u32 s2, s10, s12 ; GFX6-NEXT: v_readfirstlane_b32 s10, v0 ; GFX6-NEXT: s_addc_u32 s10, s10, 0 -; GFX6-NEXT: s_mul_i32 s11, s9, s15 +; GFX6-NEXT: s_mul_i32 s11, s9, s13 ; GFX6-NEXT: s_add_u32 s11, s2, s11 ; GFX6-NEXT: v_mov_b32_e32 v0, s11 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX6-NEXT: s_addc_u32 s10, 0, s10 ; GFX6-NEXT: s_mul_i32 s10, s6, s10 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: v_readfirstlane_b32 s14, v0 -; GFX6-NEXT: s_add_i32 s10, s14, s10 -; GFX6-NEXT: s_mul_i32 s14, s7, s11 -; GFX6-NEXT: s_add_i32 s14, s10, s14 -; GFX6-NEXT: s_sub_i32 s15, s9, s14 +; GFX6-NEXT: v_readfirstlane_b32 s12, v0 +; GFX6-NEXT: s_add_i32 s10, s12, s10 +; GFX6-NEXT: s_mul_i32 s12, s7, s11 +; GFX6-NEXT: s_add_i32 s16, s10, s12 +; GFX6-NEXT: s_sub_i32 s12, s9, s16 ; GFX6-NEXT: s_mul_i32 s10, s6, s11 ; GFX6-NEXT: s_sub_u32 s8, s8, s10 ; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX6-NEXT: s_or_b32 s16, s10, s11 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_subb_u32 s15, s15, s7 -; GFX6-NEXT: s_sub_u32 s17, s8, s6 -; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX6-NEXT: s_or_b32 s10, s10, s11 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 -; GFX6-NEXT: s_subb_u32 s18, s15, 0 -; GFX6-NEXT: s_cmp_ge_u32 s18, s7 -; GFX6-NEXT: s_cselect_b32 s11, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s17, s6 -; GFX6-NEXT: s_cselect_b32 s19, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s18, s7 -; GFX6-NEXT: s_cselect_b32 s19, s19, s11 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 -; GFX6-NEXT: s_subb_u32 s15, s15, s7 -; GFX6-NEXT: s_sub_u32 s20, s17, s6 -; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX6-NEXT: s_or_b32 s13, s10, s11 +; GFX6-NEXT: s_subb_u32 s17, s12, s7 +; GFX6-NEXT: s_sub_u32 s18, s8, s6 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s19, s12, s13 +; GFX6-NEXT: s_subb_u32 s19, s17, 0 +; GFX6-NEXT: s_cmp_ge_u32 s19, s7 +; GFX6-NEXT: s_cselect_b32 s20, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s18, s6 +; GFX6-NEXT: s_cselect_b32 s21, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s19, s7 +; GFX6-NEXT: s_cselect_b32 s20, s21, s20 +; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_subb_u32 s17, s17, s7 +; GFX6-NEXT: s_sub_u32 s21, s18, s6 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_subb_u32 s12, s17, 0 +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_cselect_b32 s13, s21, s18 +; GFX6-NEXT: s_cselect_b32 s12, s12, s19 ; GFX6-NEXT: s_or_b32 s10, s10, s11 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 -; GFX6-NEXT: s_subb_u32 s10, s15, 0 -; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: s_cselect_b32 s11, s20, s17 -; GFX6-NEXT: s_cselect_b32 s10, s10, s18 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_subb_u32 s9, s9, s14 +; GFX6-NEXT: s_subb_u32 s9, s9, s16 ; GFX6-NEXT: s_cmp_ge_u32 s9, s7 -; GFX6-NEXT: s_cselect_b32 s14, -1, 0 +; GFX6-NEXT: s_cselect_b32 s10, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s8, s6 ; GFX6-NEXT: s_cselect_b32 s6, -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s9, s7 -; GFX6-NEXT: s_cselect_b32 s6, s6, s14 +; GFX6-NEXT: s_cselect_b32 s6, s6, s10 ; GFX6-NEXT: s_cmp_lg_u32 s6, 0 -; GFX6-NEXT: s_cselect_b32 s7, s10, s9 -; GFX6-NEXT: s_cselect_b32 s6, s11, s8 +; GFX6-NEXT: s_cselect_b32 s7, s12, s9 +; GFX6-NEXT: s_cselect_b32 s6, s13, s8 ; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GFX6-NEXT: s_sub_u32 s5, s6, s4 ; GFX6-NEXT: s_subb_u32 s4, s7, s4 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v0, s12 -; GFX6-NEXT: v_mov_b32_e32 v1, s13 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: v_mov_b32_e32 v1, s15 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: v_mov_b32_e32 v3, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9780,8 +9724,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX9-NEXT: s_sub_u32 s12, 0, s2 -; GFX9-NEXT: s_subb_u32 s13, 0, s3 +; GFX9-NEXT: s_sub_u32 s6, 0, s2 +; GFX9-NEXT: s_subb_u32 s7, 0, s3 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -9790,56 +9734,52 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s14, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s7, s12, s14 -; GFX9-NEXT: s_mul_hi_u32 s16, s12, s6 -; GFX9-NEXT: s_mul_i32 s15, s13, s6 -; GFX9-NEXT: s_add_i32 s7, s16, s7 -; GFX9-NEXT: s_mul_i32 s17, s12, s6 -; GFX9-NEXT: s_add_i32 s7, s7, s15 -; GFX9-NEXT: s_mul_hi_u32 s16, s6, s17 -; GFX9-NEXT: s_mul_i32 s18, s6, s7 -; GFX9-NEXT: s_mul_hi_u32 s15, s6, s7 +; GFX9-NEXT: v_readfirstlane_b32 s12, v1 +; GFX9-NEXT: v_readfirstlane_b32 s13, v0 +; GFX9-NEXT: s_mul_i32 s14, s6, s12 +; GFX9-NEXT: s_mul_hi_u32 s16, s6, s13 +; GFX9-NEXT: s_mul_i32 s15, s7, s13 +; GFX9-NEXT: s_add_i32 s14, s16, s14 +; GFX9-NEXT: s_mul_i32 s17, s6, s13 +; GFX9-NEXT: s_add_i32 s14, s14, s15 +; GFX9-NEXT: s_mul_hi_u32 s16, s13, s17 +; GFX9-NEXT: s_mul_i32 s18, s13, s14 +; GFX9-NEXT: s_mul_hi_u32 s15, s13, s14 ; GFX9-NEXT: s_add_u32 s16, s16, s18 ; GFX9-NEXT: s_addc_u32 s15, 0, s15 -; GFX9-NEXT: s_mul_hi_u32 s18, s14, s17 -; GFX9-NEXT: s_mul_i32 s17, s14, s17 +; GFX9-NEXT: s_mul_hi_u32 s18, s12, s17 +; GFX9-NEXT: s_mul_i32 s17, s12, s17 ; GFX9-NEXT: s_add_u32 s16, s16, s17 -; GFX9-NEXT: s_mul_hi_u32 s19, s14, s7 +; GFX9-NEXT: s_mul_hi_u32 s19, s12, s14 ; GFX9-NEXT: s_addc_u32 s15, s15, s18 ; GFX9-NEXT: s_addc_u32 s16, s19, 0 -; GFX9-NEXT: s_mul_i32 s7, s14, s7 -; GFX9-NEXT: s_add_u32 s7, s15, s7 +; GFX9-NEXT: s_mul_i32 s14, s12, s14 +; GFX9-NEXT: s_add_u32 s14, s15, s14 ; GFX9-NEXT: s_addc_u32 s15, 0, s16 -; GFX9-NEXT: s_add_u32 s16, s6, s7 -; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9-NEXT: s_addc_u32 s14, s14, s15 -; GFX9-NEXT: s_mul_i32 s6, s12, s14 -; GFX9-NEXT: s_mul_hi_u32 s7, s12, s16 -; GFX9-NEXT: s_add_i32 s6, s7, s6 -; GFX9-NEXT: s_mul_i32 s13, s13, s16 -; GFX9-NEXT: s_add_i32 s6, s6, s13 -; GFX9-NEXT: s_mul_i32 s12, s12, s16 -; GFX9-NEXT: s_mul_hi_u32 s13, s14, s12 -; GFX9-NEXT: s_mul_i32 s15, s14, s12 -; GFX9-NEXT: s_mul_i32 s18, s16, s6 -; GFX9-NEXT: s_mul_hi_u32 s12, s16, s12 -; GFX9-NEXT: s_mul_hi_u32 s17, s16, s6 -; GFX9-NEXT: s_add_u32 s12, s12, s18 +; GFX9-NEXT: s_add_u32 s13, s13, s14 +; GFX9-NEXT: s_addc_u32 s12, s12, s15 +; GFX9-NEXT: s_mul_i32 s14, s6, s12 +; GFX9-NEXT: s_mul_hi_u32 s15, s6, s13 +; GFX9-NEXT: s_add_i32 s14, s15, s14 +; GFX9-NEXT: s_mul_i32 s7, s7, s13 +; GFX9-NEXT: s_add_i32 s14, s14, s7 +; GFX9-NEXT: s_mul_i32 s6, s6, s13 +; GFX9-NEXT: s_mul_hi_u32 s15, s12, s6 +; GFX9-NEXT: s_mul_i32 s16, s12, s6 +; GFX9-NEXT: s_mul_i32 s18, s13, s14 +; GFX9-NEXT: s_mul_hi_u32 s6, s13, s6 +; GFX9-NEXT: s_mul_hi_u32 s17, s13, s14 +; GFX9-NEXT: s_add_u32 s6, s6, s18 ; GFX9-NEXT: s_addc_u32 s17, 0, s17 -; GFX9-NEXT: s_add_u32 s12, s12, s15 -; GFX9-NEXT: s_mul_hi_u32 s7, s14, s6 -; GFX9-NEXT: s_addc_u32 s12, s17, s13 +; GFX9-NEXT: s_add_u32 s6, s6, s16 +; GFX9-NEXT: s_mul_hi_u32 s7, s12, s14 +; GFX9-NEXT: s_addc_u32 s6, s17, s15 ; GFX9-NEXT: s_addc_u32 s7, s7, 0 -; GFX9-NEXT: s_mul_i32 s6, s14, s6 -; GFX9-NEXT: s_add_u32 s6, s12, s6 -; GFX9-NEXT: s_addc_u32 s12, 0, s7 -; GFX9-NEXT: s_add_u32 s13, s16, s6 -; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9-NEXT: s_addc_u32 s12, s14, s12 +; GFX9-NEXT: s_mul_i32 s14, s12, s14 +; GFX9-NEXT: s_add_u32 s6, s6, s14 +; GFX9-NEXT: s_addc_u32 s7, 0, s7 +; GFX9-NEXT: s_add_u32 s13, s13, s6 +; GFX9-NEXT: s_addc_u32 s12, s12, s7 ; GFX9-NEXT: s_ashr_i32 s6, s9, 31 ; GFX9-NEXT: s_add_u32 s8, s8, s6 ; GFX9-NEXT: s_mov_b32 s7, s6 @@ -9868,11 +9808,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_mul_i32 s12, s2, s12 ; GFX9-NEXT: s_sub_u32 s8, s8, s12 ; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 ; GFX9-NEXT: s_subb_u32 s17, s14, s3 ; GFX9-NEXT: s_sub_u32 s18, s8, s2 ; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 ; GFX9-NEXT: s_subb_u32 s19, s17, 0 ; GFX9-NEXT: s_cmp_ge_u32 s19, s3 ; GFX9-NEXT: s_cselect_b32 s20, -1, 0 @@ -9881,13 +9819,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cmp_eq_u32 s19, s3 ; GFX9-NEXT: s_cselect_b32 s20, s21, s20 ; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GFX9-NEXT: s_subb_u32 s17, s17, s3 -; GFX9-NEXT: s_sub_u32 s21, s18, s2 -; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GFX9-NEXT: s_subb_u32 s14, s17, 0 +; GFX9-NEXT: s_subb_u32 s14, s17, s3 +; GFX9-NEXT: s_sub_u32 s15, s18, s2 +; GFX9-NEXT: s_subb_u32 s14, s14, 0 ; GFX9-NEXT: s_cmp_lg_u32 s20, 0 -; GFX9-NEXT: s_cselect_b32 s15, s21, s18 +; GFX9-NEXT: s_cselect_b32 s15, s15, s18 ; GFX9-NEXT: s_cselect_b32 s14, s14, s19 ; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 ; GFX9-NEXT: s_subb_u32 s9, s9, s16 @@ -9911,8 +9847,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_sub_u32 s6, 0, s2 -; GFX9-NEXT: s_subb_u32 s7, 0, s3 +; GFX9-NEXT: s_sub_u32 s4, 0, s2 +; GFX9-NEXT: s_subb_u32 s5, 0, s3 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -9922,74 +9858,70 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v1 ; GFX9-NEXT: v_readfirstlane_b32 s9, v2 -; GFX9-NEXT: s_mul_hi_u32 s8, s6, s4 -; GFX9-NEXT: s_mul_i32 s14, s6, s9 -; GFX9-NEXT: s_mul_i32 s5, s7, s4 +; GFX9-NEXT: s_mul_hi_u32 s8, s4, s6 +; GFX9-NEXT: s_mul_i32 s14, s4, s9 +; GFX9-NEXT: s_mul_i32 s7, s5, s6 ; GFX9-NEXT: s_add_i32 s8, s8, s14 -; GFX9-NEXT: s_add_i32 s8, s8, s5 -; GFX9-NEXT: s_mul_i32 s15, s6, s4 -; GFX9-NEXT: s_mul_i32 s14, s4, s8 -; GFX9-NEXT: s_mul_hi_u32 s16, s4, s15 -; GFX9-NEXT: s_mul_hi_u32 s5, s4, s8 +; GFX9-NEXT: s_add_i32 s8, s8, s7 +; GFX9-NEXT: s_mul_i32 s15, s4, s6 +; GFX9-NEXT: s_mul_i32 s14, s6, s8 +; GFX9-NEXT: s_mul_hi_u32 s16, s6, s15 +; GFX9-NEXT: s_mul_hi_u32 s7, s6, s8 ; GFX9-NEXT: s_add_u32 s14, s16, s14 -; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_addc_u32 s7, 0, s7 ; GFX9-NEXT: s_mul_hi_u32 s17, s9, s15 ; GFX9-NEXT: s_mul_i32 s15, s9, s15 ; GFX9-NEXT: s_add_u32 s14, s14, s15 ; GFX9-NEXT: s_mul_hi_u32 s16, s9, s8 -; GFX9-NEXT: s_addc_u32 s5, s5, s17 +; GFX9-NEXT: s_addc_u32 s7, s7, s17 ; GFX9-NEXT: s_addc_u32 s14, s16, 0 ; GFX9-NEXT: s_mul_i32 s8, s9, s8 -; GFX9-NEXT: s_add_u32 s5, s5, s8 +; GFX9-NEXT: s_add_u32 s7, s7, s8 ; GFX9-NEXT: s_addc_u32 s8, 0, s14 -; GFX9-NEXT: s_add_u32 s14, s4, s5 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s8, s9, s8 -; GFX9-NEXT: s_mul_i32 s4, s6, s8 -; GFX9-NEXT: s_mul_hi_u32 s5, s6, s14 -; GFX9-NEXT: s_add_i32 s4, s5, s4 -; GFX9-NEXT: s_mul_i32 s7, s7, s14 -; GFX9-NEXT: s_add_i32 s4, s4, s7 -; GFX9-NEXT: s_mul_i32 s6, s6, s14 -; GFX9-NEXT: s_mul_hi_u32 s7, s8, s6 -; GFX9-NEXT: s_mul_i32 s9, s8, s6 -; GFX9-NEXT: s_mul_i32 s16, s14, s4 -; GFX9-NEXT: s_mul_hi_u32 s6, s14, s6 -; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4 -; GFX9-NEXT: s_add_u32 s6, s6, s16 +; GFX9-NEXT: s_add_u32 s6, s6, s7 +; GFX9-NEXT: s_addc_u32 s7, s9, s8 +; GFX9-NEXT: s_mul_i32 s8, s4, s7 +; GFX9-NEXT: s_mul_hi_u32 s9, s4, s6 +; GFX9-NEXT: s_add_i32 s8, s9, s8 +; GFX9-NEXT: s_mul_i32 s5, s5, s6 +; GFX9-NEXT: s_add_i32 s8, s8, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s6 +; GFX9-NEXT: s_mul_hi_u32 s9, s7, s4 +; GFX9-NEXT: s_mul_i32 s14, s7, s4 +; GFX9-NEXT: s_mul_i32 s16, s6, s8 +; GFX9-NEXT: s_mul_hi_u32 s4, s6, s4 +; GFX9-NEXT: s_mul_hi_u32 s15, s6, s8 +; GFX9-NEXT: s_add_u32 s4, s4, s16 ; GFX9-NEXT: s_addc_u32 s15, 0, s15 -; GFX9-NEXT: s_add_u32 s6, s6, s9 -; GFX9-NEXT: s_mul_hi_u32 s5, s8, s4 -; GFX9-NEXT: s_addc_u32 s6, s15, s7 +; GFX9-NEXT: s_add_u32 s4, s4, s14 +; GFX9-NEXT: s_mul_hi_u32 s5, s7, s8 +; GFX9-NEXT: s_addc_u32 s4, s15, s9 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_mul_i32 s4, s8, s4 -; GFX9-NEXT: s_add_u32 s4, s6, s4 -; GFX9-NEXT: s_addc_u32 s6, 0, s5 -; GFX9-NEXT: s_add_u32 s9, s14, s4 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s8, s8, s6 +; GFX9-NEXT: s_mul_i32 s8, s7, s8 +; GFX9-NEXT: s_add_u32 s4, s4, s8 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_add_u32 s8, s6, s4 +; GFX9-NEXT: s_addc_u32 s9, s7, s5 ; GFX9-NEXT: s_ashr_i32 s4, s11, 31 ; GFX9-NEXT: s_add_u32 s6, s10, s4 ; GFX9-NEXT: s_mov_b32 s5, s4 ; GFX9-NEXT: s_addc_u32 s7, s11, s4 ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] -; GFX9-NEXT: s_mul_i32 s11, s6, s8 -; GFX9-NEXT: s_mul_hi_u32 s14, s6, s9 -; GFX9-NEXT: s_mul_hi_u32 s10, s6, s8 +; GFX9-NEXT: s_mul_i32 s11, s6, s9 +; GFX9-NEXT: s_mul_hi_u32 s14, s6, s8 +; GFX9-NEXT: s_mul_hi_u32 s10, s6, s9 ; GFX9-NEXT: s_add_u32 s11, s14, s11 ; GFX9-NEXT: s_addc_u32 s10, 0, s10 -; GFX9-NEXT: s_mul_hi_u32 s15, s7, s9 -; GFX9-NEXT: s_mul_i32 s9, s7, s9 -; GFX9-NEXT: s_add_u32 s9, s11, s9 -; GFX9-NEXT: s_mul_hi_u32 s14, s7, s8 -; GFX9-NEXT: s_addc_u32 s9, s10, s15 -; GFX9-NEXT: s_addc_u32 s10, s14, 0 +; GFX9-NEXT: s_mul_hi_u32 s15, s7, s8 ; GFX9-NEXT: s_mul_i32 s8, s7, s8 -; GFX9-NEXT: s_add_u32 s8, s9, s8 +; GFX9-NEXT: s_add_u32 s8, s11, s8 +; GFX9-NEXT: s_mul_hi_u32 s14, s7, s9 +; GFX9-NEXT: s_addc_u32 s8, s10, s15 +; GFX9-NEXT: s_addc_u32 s10, s14, 0 +; GFX9-NEXT: s_mul_i32 s9, s7, s9 +; GFX9-NEXT: s_add_u32 s8, s8, s9 ; GFX9-NEXT: s_addc_u32 s9, 0, s10 ; GFX9-NEXT: s_mul_i32 s9, s2, s9 ; GFX9-NEXT: s_mul_hi_u32 s10, s2, s8 @@ -10000,11 +9932,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_mul_i32 s8, s2, s8 ; GFX9-NEXT: s_sub_u32 s6, s6, s8 ; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_subb_u32 s15, s10, s3 ; GFX9-NEXT: s_sub_u32 s16, s6, s2 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 ; GFX9-NEXT: s_subb_u32 s17, s15, 0 ; GFX9-NEXT: s_cmp_ge_u32 s17, s3 ; GFX9-NEXT: s_cselect_b32 s18, -1, 0 @@ -10013,13 +9943,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cmp_eq_u32 s17, s3 ; GFX9-NEXT: s_cselect_b32 s18, s19, s18 ; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s15, s15, s3 -; GFX9-NEXT: s_sub_u32 s19, s16, s2 -; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s10, s15, 0 +; GFX9-NEXT: s_subb_u32 s10, s15, s3 +; GFX9-NEXT: s_sub_u32 s11, s16, s2 +; GFX9-NEXT: s_subb_u32 s10, s10, 0 ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cselect_b32 s11, s19, s16 +; GFX9-NEXT: s_cselect_b32 s11, s11, s16 ; GFX9-NEXT: s_cselect_b32 s10, s10, s17 ; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_subb_u32 s7, s7, s14 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index 394727c..01f4414 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -612,12 +612,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s8 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -653,12 +652,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -693,11 +691,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: s_add_i32 s2, s2, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -733,11 +730,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -774,11 +770,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -818,11 +813,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -859,11 +853,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -901,15 +894,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -999,12 +992,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s8 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1042,12 +1034,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1084,11 +1075,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: s_add_i32 s2, s2, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1127,11 +1117,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1171,11 +1160,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1218,11 +1206,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1261,11 +1248,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1306,15 +1292,15 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB3_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2073,12 +2059,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s8 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2114,12 +2099,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2154,11 +2138,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: s_add_i32 s2, s2, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2194,11 +2177,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2235,11 +2217,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2279,11 +2260,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2321,11 +2301,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2363,15 +2342,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 258bc295..9db6d70 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -717,12 +717,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 -; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 +; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s3 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -762,12 +761,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 +; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s3 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -805,13 +803,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[0:1] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -853,11 +850,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s6, s6, s2 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -897,14 +893,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1164_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -949,11 +944,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s6, s6, s2 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -993,14 +987,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1264_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s7 ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 -; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 ; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8 -; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1028,6 +1022,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xf1ff ; GFX1264_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null @@ -1041,15 +1036,15 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1232_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1 -; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 -; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -2363,7 +2358,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -2416,7 +2410,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -2462,13 +2455,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s2 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2 ; GFX1064_ITERATIVE-NEXT: s_add_u32 s6, s6, s3 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -2515,13 +2507,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s1 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s6, s6, s2 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -2569,14 +2560,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s2 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s6, s6, s3 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -2626,14 +2616,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s6, s6, s2 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -2677,16 +2666,16 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1264_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8 +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10 -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10 -; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10 ; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9] ; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] -; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -2731,17 +2720,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 -; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 -; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8 -; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -4490,12 +4479,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0 -; GFX8_ITERATIVE-NEXT: s_add_i32 s12, s12, s6 +; GFX8_ITERATIVE-NEXT: s_add_i32 s12, s12, s3 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -4550,12 +4538,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s12, s12, s6 +; GFX9_ITERATIVE-NEXT: s_add_i32 s12, s12, s3 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -4608,13 +4595,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s6 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s2 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1064_ITERATIVE-NEXT: s_add_i32 s12, s12, s6 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064_ITERATIVE-NEXT: s_add_i32 s12, s12, s7 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -4670,11 +4656,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s8, s1 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s8, s8, s2 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -4728,14 +4713,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2 ; GFX1164_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s6 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s6 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s2 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164_ITERATIVE-NEXT: s_add_i32 s12, s12, s6 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164_ITERATIVE-NEXT: s_add_i32 s12, s12, s7 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -4799,11 +4783,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s8, s1 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s8, s8, s2 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -4861,14 +4844,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1264_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s7 ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 -; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 ; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8 -; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -4896,6 +4879,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xf1ff ; GFX1264_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null @@ -4909,15 +4893,15 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1232_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1 -; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 -; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -6673,7 +6657,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -6746,7 +6729,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -6812,13 +6794,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2 ; GFX1064_ITERATIVE-NEXT: s_add_u32 s8, s8, s3 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s9, s9, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -6883,13 +6864,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s1 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s8, s8, s2 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -6955,14 +6935,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s8, s8, s3 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s9, s9, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -7036,14 +7015,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s1 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s8, s8, s2 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -7109,16 +7087,16 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1264_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8 +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10 -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10 -; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10 ; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9] ; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] -; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -7163,17 +7141,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 -; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 -; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8 -; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 23c5f4f..6167a84 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -499,12 +499,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -540,12 +539,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -580,11 +578,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -621,11 +618,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -663,11 +659,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -707,11 +702,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1088,11 +1082,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX8_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s4 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1117,11 +1110,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX9_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s4 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1147,9 +1139,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1176,9 +1167,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1206,10 +1196,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] ; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1239,10 +1227,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2022,7 +2008,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2071,7 +2056,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2112,13 +2096,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2160,13 +2143,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2209,14 +2191,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -2261,14 +2242,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -2881,7 +2861,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2914,7 +2893,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2946,7 +2924,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2979,7 +2956,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3013,8 +2989,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3048,9 +3022,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s4 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3906,12 +3879,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3947,12 +3919,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3987,11 +3958,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4028,11 +3998,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4070,11 +4039,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -4114,11 +4082,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -4495,11 +4462,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX8_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s4 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4524,11 +4490,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX9_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s4 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4554,9 +4519,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4583,9 +4547,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4613,10 +4576,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] ; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4646,10 +4607,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5452,7 +5411,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5501,7 +5459,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5542,13 +5499,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5590,13 +5546,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5639,14 +5594,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -5691,14 +5645,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -6313,12 +6266,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6354,12 +6306,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6394,11 +6345,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6435,11 +6385,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_and_b32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6477,11 +6426,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -6521,11 +6469,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_and_b32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -6926,12 +6873,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX8_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6973,12 +6919,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX9_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7015,15 +6960,14 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1064_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7065,12 +7009,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7109,16 +7052,15 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1164_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -7163,12 +7105,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -7672,12 +7613,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7713,12 +7653,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7753,11 +7692,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7794,11 +7732,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_or_b32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7836,11 +7773,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -7880,11 +7816,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_or_b32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -8284,12 +8219,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX8_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8331,12 +8265,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX9_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8373,15 +8306,14 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1064_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8423,12 +8355,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8467,16 +8398,15 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1164_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8521,12 +8451,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9030,12 +8959,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9071,12 +8999,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9111,11 +9038,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9152,11 +9078,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9194,11 +9119,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -9238,11 +9162,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -9642,12 +9565,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9689,12 +9611,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9731,15 +9652,14 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9781,12 +9701,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9825,16 +9744,15 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9879,12 +9797,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -10388,12 +10305,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10429,12 +10345,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10469,11 +10384,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10510,11 +10424,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_max_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10552,11 +10465,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -10596,11 +10508,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_max_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -11255,7 +11166,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11311,7 +11221,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11363,7 +11272,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11415,7 +11323,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11468,9 +11375,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -11525,9 +11431,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -12214,12 +12119,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12255,12 +12159,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12295,11 +12198,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12336,11 +12238,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_min_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12378,11 +12279,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -12422,11 +12322,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_min_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -13081,7 +12980,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13137,7 +13035,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13189,7 +13086,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13241,7 +13137,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13294,9 +13189,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -13351,9 +13245,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -14040,12 +13933,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14081,12 +13973,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14121,11 +14012,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14162,11 +14052,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_max_u32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14204,11 +14093,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -14248,11 +14136,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_max_u32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -14901,7 +14788,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14956,7 +14842,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15007,7 +14892,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15058,7 +14942,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15112,8 +14995,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -15169,8 +15050,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -15853,12 +15732,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15894,12 +15772,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15934,11 +15811,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15975,11 +15851,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_min_u32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16017,11 +15892,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -16061,11 +15935,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_min_u32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -16715,7 +16588,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16770,7 +16642,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16821,7 +16692,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16872,7 +16742,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16926,8 +16795,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -16983,8 +16850,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index e4def28..9afc0c6 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -611,12 +611,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s8 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -652,12 +651,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -692,11 +690,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: s_add_i32 s2, s2, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -732,11 +729,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -773,11 +769,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -817,11 +812,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -858,11 +852,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -900,15 +893,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1665,12 +1658,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s8 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1706,12 +1698,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1746,11 +1737,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: s_add_i32 s2, s2, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1786,11 +1776,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1827,11 +1816,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1871,11 +1859,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1913,11 +1900,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1955,15 +1941,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB6_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index 39a3c9a..10fd34f 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -628,12 +628,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s8 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -670,12 +669,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -711,11 +709,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: s_add_i32 s2, s2, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -752,11 +749,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -794,11 +790,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -839,11 +834,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -880,11 +874,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -923,15 +916,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1833,12 +1826,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s8 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1875,12 +1867,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1916,11 +1907,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: s_add_i32 s2, s2, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1957,11 +1947,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1999,11 +1988,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2044,11 +2032,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2086,11 +2073,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2129,15 +2115,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index 4a6fa4f..b96de17 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -704,7 +704,6 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; CISI-NEXT: s_add_u32 s4, s4, s6 ; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0 ; CISI-NEXT: s_or_b32 s6, s12, s13 -; CISI-NEXT: s_cmp_lg_u32 s6, 0 ; CISI-NEXT: s_addc_u32 s5, s5, s7 ; CISI-NEXT: s_mov_b32 s8, s0 ; CISI-NEXT: s_mov_b32 s9, s1 @@ -725,16 +724,14 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: s_add_u32 s2, s4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_add_u32 s0, s4, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_addc_u32 s1, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 -; VI-NEXT: s_addc_u32 s0, s5, s7 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_mov_b32_e32 v5, s0 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -746,12 +743,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s2, s12, s14 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_addc_u32 s0, s13, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_add_u32 s0, s12, s14 +; GFX9-NEXT: s_addc_u32 s1, s13, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -764,10 +759,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_add_u32 s0, s12, s14 -; GFX1010-NEXT: s_cselect_b32 s1, -1, 0 -; GFX1010-NEXT: v_mov_b32_e32 v0, s0 -; GFX1010-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1010-NEXT: s_addc_u32 s1, s13, s15 +; GFX1010-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -781,10 +774,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_add_u32 s4, s4, s6 -; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0 -; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0 ; GFX1030W32-NEXT: s_addc_u32 s5, s5, s7 +; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 @@ -798,10 +789,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_add_u32 s4, s4, s6 -; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX1030W64-NEXT: s_addc_u32 s5, s5, s7 +; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5 ; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] @@ -814,10 +803,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s4, s4, s6 -; GFX11-NEXT: s_cselect_b32 s6, -1, 0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_cmp_lg_u32 s6, 0 ; GFX11-NEXT: s_addc_u32 s5, s5, s7 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 @@ -831,10 +818,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_add_co_u32 s0, s12, s14 -; GFX1250-NEXT: s_cselect_b32 s1, -1, 0 -; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0 -; GFX1250-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1250-NEXT: s_add_co_ci_u32 s1, s13, s15 +; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0 ; GFX1250-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -1691,7 +1676,6 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; CISI-NEXT: s_sub_u32 s4, s4, s6 ; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0 ; CISI-NEXT: s_or_b32 s6, s12, s13 -; CISI-NEXT: s_cmp_lg_u32 s6, 0 ; CISI-NEXT: s_subb_u32 s5, s5, s7 ; CISI-NEXT: s_mov_b32 s8, s0 ; CISI-NEXT: s_mov_b32 s9, s1 @@ -1712,16 +1696,14 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: s_sub_u32 s2, s4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_sub_u32 s0, s4, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_subb_u32 s1, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 -; VI-NEXT: s_subb_u32 s0, s5, s7 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_mov_b32_e32 v5, s0 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -1733,12 +1715,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s2, s12, s14 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_subb_u32 s0, s13, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_sub_u32 s0, s12, s14 +; GFX9-NEXT: s_subb_u32 s1, s13, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -1751,10 +1731,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_sub_u32 s0, s12, s14 -; GFX1010-NEXT: s_cselect_b32 s1, -1, 0 -; GFX1010-NEXT: v_mov_b32_e32 v0, s0 -; GFX1010-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1010-NEXT: s_subb_u32 s1, s13, s15 +; GFX1010-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -1768,10 +1746,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_sub_u32 s4, s4, s6 -; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0 -; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0 ; GFX1030W32-NEXT: s_subb_u32 s5, s5, s7 +; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 @@ -1785,10 +1761,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_sub_u32 s4, s4, s6 -; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX1030W64-NEXT: s_subb_u32 s5, s5, s7 +; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5 ; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] @@ -1801,10 +1775,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s4, s4, s6 -; GFX11-NEXT: s_cselect_b32 s6, -1, 0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_cmp_lg_u32 s6, 0 ; GFX11-NEXT: s_subb_u32 s5, s5, s7 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 @@ -1818,10 +1790,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_sub_co_u32 s0, s12, s14 -; GFX1250-NEXT: s_cselect_b32 s1, -1, 0 -; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0 -; GFX1250-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1250-NEXT: s_sub_co_ci_u32 s1, s13, s15 +; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0 ; GFX1250-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -2218,49 +2188,46 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; VI-NEXT: s_addc_u32 s6, s7, s9 ; VI-NEXT: s_addc_u32 s8, s8, 0 ; VI-NEXT: v_readfirstlane_b32 s7, v0 -; VI-NEXT: s_add_u32 s12, s6, s7 -; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: s_add_u32 s10, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s10 ; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s4, v0, 0 -; VI-NEXT: s_addc_u32 s13, 0, s8 -; VI-NEXT: s_mul_i32 s8, s4, s13 +; VI-NEXT: s_addc_u32 s11, 0, s8 +; VI-NEXT: s_mul_i32 s8, s4, s11 ; VI-NEXT: v_readfirstlane_b32 s9, v1 ; VI-NEXT: s_add_i32 s8, s9, s8 -; VI-NEXT: s_mul_i32 s9, s5, s12 -; VI-NEXT: s_add_i32 s14, s8, s9 -; VI-NEXT: s_sub_i32 s10, s3, s14 +; VI-NEXT: s_mul_i32 s9, s5, s10 +; VI-NEXT: s_add_i32 s12, s8, s9 +; VI-NEXT: s_sub_i32 s13, s3, s12 ; VI-NEXT: v_readfirstlane_b32 s8, v0 -; VI-NEXT: s_sub_u32 s15, s2, s8 +; VI-NEXT: s_sub_u32 s14, s2, s8 ; VI-NEXT: s_cselect_b64 s[8:9], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[8:9], 0 -; VI-NEXT: s_subb_u32 s16, s10, s5 -; VI-NEXT: s_sub_u32 s17, s15, s4 -; VI-NEXT: s_cselect_b64 s[10:11], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[10:11], 0 -; VI-NEXT: s_subb_u32 s10, s16, 0 -; VI-NEXT: s_cmp_ge_u32 s10, s5 -; VI-NEXT: s_cselect_b32 s11, -1, 0 -; VI-NEXT: s_cmp_ge_u32 s17, s4 +; VI-NEXT: s_subb_u32 s13, s13, s5 +; VI-NEXT: s_sub_u32 s15, s14, s4 +; VI-NEXT: s_subb_u32 s13, s13, 0 +; VI-NEXT: s_cmp_ge_u32 s13, s5 ; VI-NEXT: s_cselect_b32 s16, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s10, s5 -; VI-NEXT: s_cselect_b32 s10, s16, s11 -; VI-NEXT: s_add_u32 s11, s12, 1 -; VI-NEXT: s_addc_u32 s16, s13, 0 -; VI-NEXT: s_add_u32 s17, s12, 2 -; VI-NEXT: s_addc_u32 s18, s13, 0 -; VI-NEXT: s_cmp_lg_u32 s10, 0 -; VI-NEXT: s_cselect_b32 s10, s17, s11 -; VI-NEXT: s_cselect_b32 s11, s18, s16 +; VI-NEXT: s_cmp_ge_u32 s15, s4 +; VI-NEXT: s_cselect_b32 s15, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s13, s5 +; VI-NEXT: s_cselect_b32 s13, s15, s16 +; VI-NEXT: s_add_u32 s15, s10, 1 +; VI-NEXT: s_addc_u32 s16, s11, 0 +; VI-NEXT: s_add_u32 s17, s10, 2 +; VI-NEXT: s_addc_u32 s18, s11, 0 +; VI-NEXT: s_cmp_lg_u32 s13, 0 +; VI-NEXT: s_cselect_b32 s13, s17, s15 +; VI-NEXT: s_cselect_b32 s15, s18, s16 ; VI-NEXT: s_cmp_lg_u64 s[8:9], 0 -; VI-NEXT: s_subb_u32 s3, s3, s14 +; VI-NEXT: s_subb_u32 s3, s3, s12 ; VI-NEXT: s_cmp_ge_u32 s3, s5 ; VI-NEXT: s_cselect_b32 s8, -1, 0 -; VI-NEXT: s_cmp_ge_u32 s15, s4 +; VI-NEXT: s_cmp_ge_u32 s14, s4 ; VI-NEXT: s_cselect_b32 s9, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s3, s5 ; VI-NEXT: s_cselect_b32 s3, s9, s8 ; VI-NEXT: s_cmp_lg_u32 s3, 0 -; VI-NEXT: s_cselect_b32 s9, s11, s13 -; VI-NEXT: s_cselect_b32 s8, s10, s12 +; VI-NEXT: s_cselect_b32 s9, s15, s11 +; VI-NEXT: s_cselect_b32 s8, s13, s10 ; VI-NEXT: s_cbranch_execnz .LBB16_4 ; VI-NEXT: .LBB16_2: ; VI-NEXT: v_cvt_f32_u32_e32 v0, s4 @@ -2311,8 +2278,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_u32 s10, 0, s6 -; GFX9-NEXT: s_subb_u32 s11, 0, s7 +; GFX9-NEXT: s_sub_u32 s8, 0, s6 +; GFX9-NEXT: s_subb_u32 s9, 0, s7 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2321,109 +2288,102 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX9-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s12, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; GFX9-NEXT: s_mul_i32 s9, s10, s12 -; GFX9-NEXT: s_mul_hi_u32 s14, s10, s8 -; GFX9-NEXT: s_mul_i32 s13, s11, s8 -; GFX9-NEXT: s_add_i32 s9, s14, s9 -; GFX9-NEXT: s_add_i32 s9, s9, s13 -; GFX9-NEXT: s_mul_i32 s15, s10, s8 -; GFX9-NEXT: s_mul_i32 s14, s8, s9 -; GFX9-NEXT: s_mul_hi_u32 s16, s8, s15 -; GFX9-NEXT: s_mul_hi_u32 s13, s8, s9 +; GFX9-NEXT: v_readfirstlane_b32 s10, v1 +; GFX9-NEXT: v_readfirstlane_b32 s11, v0 +; GFX9-NEXT: s_mul_i32 s12, s8, s10 +; GFX9-NEXT: s_mul_hi_u32 s14, s8, s11 +; GFX9-NEXT: s_mul_i32 s13, s9, s11 +; GFX9-NEXT: s_add_i32 s12, s14, s12 +; GFX9-NEXT: s_add_i32 s12, s12, s13 +; GFX9-NEXT: s_mul_i32 s15, s8, s11 +; GFX9-NEXT: s_mul_i32 s14, s11, s12 +; GFX9-NEXT: s_mul_hi_u32 s16, s11, s15 +; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12 ; GFX9-NEXT: s_add_u32 s14, s16, s14 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15 -; GFX9-NEXT: s_mul_i32 s15, s12, s15 +; GFX9-NEXT: s_mul_hi_u32 s17, s10, s15 +; GFX9-NEXT: s_mul_i32 s15, s10, s15 ; GFX9-NEXT: s_add_u32 s14, s14, s15 -; GFX9-NEXT: s_mul_hi_u32 s16, s12, s9 +; GFX9-NEXT: s_mul_hi_u32 s16, s10, s12 ; GFX9-NEXT: s_addc_u32 s13, s13, s17 ; GFX9-NEXT: s_addc_u32 s14, s16, 0 -; GFX9-NEXT: s_mul_i32 s9, s12, s9 -; GFX9-NEXT: s_add_u32 s9, s13, s9 +; GFX9-NEXT: s_mul_i32 s12, s10, s12 +; GFX9-NEXT: s_add_u32 s12, s13, s12 ; GFX9-NEXT: s_addc_u32 s13, 0, s14 -; GFX9-NEXT: s_add_u32 s14, s8, s9 -; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GFX9-NEXT: s_addc_u32 s12, s12, s13 -; GFX9-NEXT: s_mul_i32 s8, s10, s12 -; GFX9-NEXT: s_mul_hi_u32 s9, s10, s14 -; GFX9-NEXT: s_add_i32 s8, s9, s8 -; GFX9-NEXT: s_mul_i32 s11, s11, s14 -; GFX9-NEXT: s_add_i32 s8, s8, s11 -; GFX9-NEXT: s_mul_i32 s10, s10, s14 -; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10 -; GFX9-NEXT: s_mul_i32 s13, s12, s10 -; GFX9-NEXT: s_mul_i32 s16, s14, s8 -; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10 -; GFX9-NEXT: s_mul_hi_u32 s15, s14, s8 -; GFX9-NEXT: s_add_u32 s10, s10, s16 +; GFX9-NEXT: s_add_u32 s11, s11, s12 +; GFX9-NEXT: s_addc_u32 s10, s10, s13 +; GFX9-NEXT: s_mul_i32 s12, s8, s10 +; GFX9-NEXT: s_mul_hi_u32 s13, s8, s11 +; GFX9-NEXT: s_add_i32 s12, s13, s12 +; GFX9-NEXT: s_mul_i32 s9, s9, s11 +; GFX9-NEXT: s_add_i32 s12, s12, s9 +; GFX9-NEXT: s_mul_i32 s8, s8, s11 +; GFX9-NEXT: s_mul_hi_u32 s13, s10, s8 +; GFX9-NEXT: s_mul_i32 s14, s10, s8 +; GFX9-NEXT: s_mul_i32 s16, s11, s12 +; GFX9-NEXT: s_mul_hi_u32 s8, s11, s8 +; GFX9-NEXT: s_mul_hi_u32 s15, s11, s12 +; GFX9-NEXT: s_add_u32 s8, s8, s16 ; GFX9-NEXT: s_addc_u32 s15, 0, s15 -; GFX9-NEXT: s_add_u32 s10, s10, s13 -; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8 -; GFX9-NEXT: s_addc_u32 s10, s15, s11 +; GFX9-NEXT: s_add_u32 s8, s8, s14 +; GFX9-NEXT: s_mul_hi_u32 s9, s10, s12 +; GFX9-NEXT: s_addc_u32 s8, s15, s13 ; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: s_mul_i32 s8, s12, s8 -; GFX9-NEXT: s_add_u32 s8, s10, s8 -; GFX9-NEXT: s_addc_u32 s10, 0, s9 -; GFX9-NEXT: s_add_u32 s11, s14, s8 -; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GFX9-NEXT: s_addc_u32 s8, s12, s10 -; GFX9-NEXT: s_mul_i32 s10, s2, s8 -; GFX9-NEXT: s_mul_hi_u32 s12, s2, s11 -; GFX9-NEXT: s_mul_hi_u32 s9, s2, s8 -; GFX9-NEXT: s_add_u32 s10, s12, s10 +; GFX9-NEXT: s_mul_i32 s12, s10, s12 +; GFX9-NEXT: s_add_u32 s8, s8, s12 ; GFX9-NEXT: s_addc_u32 s9, 0, s9 -; GFX9-NEXT: s_mul_hi_u32 s13, s3, s11 -; GFX9-NEXT: s_mul_i32 s11, s3, s11 -; GFX9-NEXT: s_add_u32 s10, s10, s11 -; GFX9-NEXT: s_mul_hi_u32 s12, s3, s8 -; GFX9-NEXT: s_addc_u32 s9, s9, s13 -; GFX9-NEXT: s_addc_u32 s10, s12, 0 +; GFX9-NEXT: s_add_u32 s8, s11, s8 +; GFX9-NEXT: s_addc_u32 s9, s10, s9 +; GFX9-NEXT: s_mul_i32 s11, s2, s9 +; GFX9-NEXT: s_mul_hi_u32 s12, s2, s8 +; GFX9-NEXT: s_mul_hi_u32 s10, s2, s9 +; GFX9-NEXT: s_add_u32 s11, s12, s11 +; GFX9-NEXT: s_addc_u32 s10, 0, s10 +; GFX9-NEXT: s_mul_hi_u32 s13, s3, s8 ; GFX9-NEXT: s_mul_i32 s8, s3, s8 -; GFX9-NEXT: s_add_u32 s12, s9, s8 -; GFX9-NEXT: s_addc_u32 s13, 0, s10 -; GFX9-NEXT: s_mul_i32 s8, s6, s13 -; GFX9-NEXT: s_mul_hi_u32 s9, s6, s12 +; GFX9-NEXT: s_add_u32 s8, s11, s8 +; GFX9-NEXT: s_mul_hi_u32 s12, s3, s9 +; GFX9-NEXT: s_addc_u32 s8, s10, s13 +; GFX9-NEXT: s_addc_u32 s10, s12, 0 +; GFX9-NEXT: s_mul_i32 s9, s3, s9 +; GFX9-NEXT: s_add_u32 s11, s8, s9 +; GFX9-NEXT: s_addc_u32 s10, 0, s10 +; GFX9-NEXT: s_mul_i32 s8, s6, s10 +; GFX9-NEXT: s_mul_hi_u32 s9, s6, s11 ; GFX9-NEXT: s_add_i32 s8, s9, s8 -; GFX9-NEXT: s_mul_i32 s9, s7, s12 -; GFX9-NEXT: s_add_i32 s14, s8, s9 -; GFX9-NEXT: s_sub_i32 s10, s3, s14 -; GFX9-NEXT: s_mul_i32 s8, s6, s12 -; GFX9-NEXT: s_sub_u32 s15, s2, s8 +; GFX9-NEXT: s_mul_i32 s9, s7, s11 +; GFX9-NEXT: s_add_i32 s12, s8, s9 +; GFX9-NEXT: s_sub_i32 s13, s3, s12 +; GFX9-NEXT: s_mul_i32 s8, s6, s11 +; GFX9-NEXT: s_sub_u32 s14, s2, s8 ; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GFX9-NEXT: s_subb_u32 s16, s10, s7 -; GFX9-NEXT: s_sub_u32 s17, s15, s6 -; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s10, s16, 0 -; GFX9-NEXT: s_cmp_ge_u32 s10, s7 -; GFX9-NEXT: s_cselect_b32 s11, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s17, s6 +; GFX9-NEXT: s_subb_u32 s13, s13, s7 +; GFX9-NEXT: s_sub_u32 s15, s14, s6 +; GFX9-NEXT: s_subb_u32 s13, s13, 0 +; GFX9-NEXT: s_cmp_ge_u32 s13, s7 ; GFX9-NEXT: s_cselect_b32 s16, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s10, s7 -; GFX9-NEXT: s_cselect_b32 s10, s16, s11 -; GFX9-NEXT: s_add_u32 s11, s12, 1 -; GFX9-NEXT: s_addc_u32 s16, s13, 0 -; GFX9-NEXT: s_add_u32 s17, s12, 2 -; GFX9-NEXT: s_addc_u32 s18, s13, 0 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 -; GFX9-NEXT: s_cselect_b32 s10, s17, s11 -; GFX9-NEXT: s_cselect_b32 s11, s18, s16 +; GFX9-NEXT: s_cmp_ge_u32 s15, s6 +; GFX9-NEXT: s_cselect_b32 s15, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s13, s7 +; GFX9-NEXT: s_cselect_b32 s13, s15, s16 +; GFX9-NEXT: s_add_u32 s15, s11, 1 +; GFX9-NEXT: s_addc_u32 s16, s10, 0 +; GFX9-NEXT: s_add_u32 s17, s11, 2 +; GFX9-NEXT: s_addc_u32 s18, s10, 0 +; GFX9-NEXT: s_cmp_lg_u32 s13, 0 +; GFX9-NEXT: s_cselect_b32 s13, s17, s15 +; GFX9-NEXT: s_cselect_b32 s15, s18, s16 ; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GFX9-NEXT: s_subb_u32 s3, s3, s14 +; GFX9-NEXT: s_subb_u32 s3, s3, s12 ; GFX9-NEXT: s_cmp_ge_u32 s3, s7 ; GFX9-NEXT: s_cselect_b32 s8, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s15, s6 +; GFX9-NEXT: s_cmp_ge_u32 s14, s6 ; GFX9-NEXT: s_cselect_b32 s9, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s3, s7 ; GFX9-NEXT: s_cselect_b32 s3, s9, s8 ; GFX9-NEXT: s_cmp_lg_u32 s3, 0 -; GFX9-NEXT: s_cselect_b32 s9, s11, s13 -; GFX9-NEXT: s_cselect_b32 s8, s10, s12 +; GFX9-NEXT: s_cselect_b32 s9, s15, s10 +; GFX9-NEXT: s_cselect_b32 s8, s13, s11 ; GFX9-NEXT: s_cbranch_execnz .LBB16_3 ; GFX9-NEXT: .LBB16_2: ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 @@ -2503,44 +2463,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_add_u32 s11, s12, s11 ; GFX1010-NEXT: s_addc_u32 s12, 0, s13 ; GFX1010-NEXT: s_add_u32 s8, s8, s11 -; GFX1010-NEXT: s_cselect_b32 s11, -1, 0 -; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s8 -; GFX1010-NEXT: s_cmp_lg_u32 s11, 0 -; GFX1010-NEXT: s_mul_i32 s11, s9, s8 ; GFX1010-NEXT: s_addc_u32 s5, s5, s12 -; GFX1010-NEXT: s_mul_i32 s10, s10, s8 +; GFX1010-NEXT: s_mul_hi_u32 s11, s9, s8 +; GFX1010-NEXT: s_mul_i32 s12, s9, s8 ; GFX1010-NEXT: s_mul_i32 s9, s9, s5 -; GFX1010-NEXT: s_mul_hi_u32 s12, s8, s11 -; GFX1010-NEXT: s_add_i32 s9, s13, s9 -; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s11 +; GFX1010-NEXT: s_mul_i32 s10, s10, s8 +; GFX1010-NEXT: s_add_i32 s9, s11, s9 +; GFX1010-NEXT: s_mul_i32 s11, s5, s12 ; GFX1010-NEXT: s_add_i32 s9, s9, s10 -; GFX1010-NEXT: s_mul_i32 s10, s5, s11 +; GFX1010-NEXT: s_mul_hi_u32 s10, s8, s12 ; GFX1010-NEXT: s_mul_i32 s15, s8, s9 ; GFX1010-NEXT: s_mul_hi_u32 s14, s8, s9 -; GFX1010-NEXT: s_add_u32 s12, s12, s15 +; GFX1010-NEXT: s_add_u32 s10, s10, s15 +; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s12 ; GFX1010-NEXT: s_addc_u32 s14, 0, s14 -; GFX1010-NEXT: s_mul_hi_u32 s11, s5, s9 -; GFX1010-NEXT: s_add_u32 s10, s12, s10 +; GFX1010-NEXT: s_mul_hi_u32 s12, s5, s9 +; GFX1010-NEXT: s_add_u32 s10, s10, s11 ; GFX1010-NEXT: s_mul_i32 s9, s5, s9 ; GFX1010-NEXT: s_addc_u32 s10, s14, s13 -; GFX1010-NEXT: s_addc_u32 s11, s11, 0 +; GFX1010-NEXT: s_addc_u32 s11, s12, 0 ; GFX1010-NEXT: s_add_u32 s9, s10, s9 ; GFX1010-NEXT: s_addc_u32 s10, 0, s11 ; GFX1010-NEXT: s_add_u32 s8, s8, s9 -; GFX1010-NEXT: s_cselect_b32 s9, -1, 0 -; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s8 -; GFX1010-NEXT: s_cmp_lg_u32 s9, 0 -; GFX1010-NEXT: s_mul_hi_u32 s9, s3, s8 ; GFX1010-NEXT: s_addc_u32 s5, s5, s10 -; GFX1010-NEXT: s_mul_i32 s8, s3, s8 +; GFX1010-NEXT: s_mul_hi_u32 s9, s2, s8 ; GFX1010-NEXT: s_mul_i32 s12, s2, s5 -; GFX1010-NEXT: s_mul_hi_u32 s10, s2, s5 -; GFX1010-NEXT: s_add_u32 s11, s11, s12 -; GFX1010-NEXT: s_addc_u32 s10, 0, s10 +; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s5 +; GFX1010-NEXT: s_mul_hi_u32 s10, s3, s8 +; GFX1010-NEXT: s_mul_i32 s8, s3, s8 +; GFX1010-NEXT: s_add_u32 s9, s9, s12 +; GFX1010-NEXT: s_addc_u32 s11, 0, s11 ; GFX1010-NEXT: s_mul_hi_u32 s13, s3, s5 -; GFX1010-NEXT: s_add_u32 s8, s11, s8 +; GFX1010-NEXT: s_add_u32 s8, s9, s8 ; GFX1010-NEXT: s_mul_i32 s5, s3, s5 -; GFX1010-NEXT: s_addc_u32 s8, s10, s9 +; GFX1010-NEXT: s_addc_u32 s8, s11, s10 ; GFX1010-NEXT: s_addc_u32 s9, s13, 0 ; GFX1010-NEXT: s_add_u32 s5, s8, s5 ; GFX1010-NEXT: s_addc_u32 s8, 0, s9 @@ -2553,11 +2509,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_sub_i32 s11, s3, s9 ; GFX1010-NEXT: s_sub_u32 s10, s2, s10 ; GFX1010-NEXT: s_cselect_b32 s12, -1, 0 -; GFX1010-NEXT: s_cmp_lg_u32 s12, 0 ; GFX1010-NEXT: s_subb_u32 s11, s11, s7 ; GFX1010-NEXT: s_sub_u32 s13, s10, s6 -; GFX1010-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1010-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1010-NEXT: s_subb_u32 s11, s11, 0 ; GFX1010-NEXT: s_cmp_ge_u32 s11, s7 ; GFX1010-NEXT: s_cselect_b32 s14, -1, 0 @@ -2663,44 +2616,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: s_add_u32 s11, s12, s11 ; GFX1030W32-NEXT: s_addc_u32 s12, 0, s13 ; GFX1030W32-NEXT: s_add_u32 s8, s8, s11 -; GFX1030W32-NEXT: s_cselect_b32 s11, -1, 0 -; GFX1030W32-NEXT: s_mul_hi_u32 s13, s9, s8 -; GFX1030W32-NEXT: s_cmp_lg_u32 s11, 0 -; GFX1030W32-NEXT: s_mul_i32 s11, s9, s8 ; GFX1030W32-NEXT: s_addc_u32 s7, s7, s12 -; GFX1030W32-NEXT: s_mul_i32 s10, s10, s8 +; GFX1030W32-NEXT: s_mul_hi_u32 s11, s9, s8 +; GFX1030W32-NEXT: s_mul_i32 s12, s9, s8 ; GFX1030W32-NEXT: s_mul_i32 s9, s9, s7 -; GFX1030W32-NEXT: s_mul_hi_u32 s12, s8, s11 -; GFX1030W32-NEXT: s_add_i32 s9, s13, s9 -; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s11 +; GFX1030W32-NEXT: s_mul_i32 s10, s10, s8 +; GFX1030W32-NEXT: s_add_i32 s9, s11, s9 +; GFX1030W32-NEXT: s_mul_i32 s11, s7, s12 ; GFX1030W32-NEXT: s_add_i32 s9, s9, s10 -; GFX1030W32-NEXT: s_mul_i32 s10, s7, s11 +; GFX1030W32-NEXT: s_mul_hi_u32 s10, s8, s12 ; GFX1030W32-NEXT: s_mul_i32 s15, s8, s9 ; GFX1030W32-NEXT: s_mul_hi_u32 s14, s8, s9 -; GFX1030W32-NEXT: s_add_u32 s12, s12, s15 +; GFX1030W32-NEXT: s_add_u32 s10, s10, s15 +; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s12 ; GFX1030W32-NEXT: s_addc_u32 s14, 0, s14 -; GFX1030W32-NEXT: s_mul_hi_u32 s11, s7, s9 -; GFX1030W32-NEXT: s_add_u32 s10, s12, s10 +; GFX1030W32-NEXT: s_mul_hi_u32 s12, s7, s9 +; GFX1030W32-NEXT: s_add_u32 s10, s10, s11 ; GFX1030W32-NEXT: s_mul_i32 s9, s7, s9 ; GFX1030W32-NEXT: s_addc_u32 s10, s14, s13 -; GFX1030W32-NEXT: s_addc_u32 s11, s11, 0 +; GFX1030W32-NEXT: s_addc_u32 s11, s12, 0 ; GFX1030W32-NEXT: s_add_u32 s9, s10, s9 ; GFX1030W32-NEXT: s_addc_u32 s10, 0, s11 ; GFX1030W32-NEXT: s_add_u32 s8, s8, s9 -; GFX1030W32-NEXT: s_cselect_b32 s9, -1, 0 -; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s8 -; GFX1030W32-NEXT: s_cmp_lg_u32 s9, 0 -; GFX1030W32-NEXT: s_mul_hi_u32 s9, s3, s8 ; GFX1030W32-NEXT: s_addc_u32 s7, s7, s10 -; GFX1030W32-NEXT: s_mul_i32 s8, s3, s8 +; GFX1030W32-NEXT: s_mul_hi_u32 s9, s2, s8 ; GFX1030W32-NEXT: s_mul_i32 s12, s2, s7 -; GFX1030W32-NEXT: s_mul_hi_u32 s10, s2, s7 -; GFX1030W32-NEXT: s_add_u32 s11, s11, s12 -; GFX1030W32-NEXT: s_addc_u32 s10, 0, s10 +; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s7 +; GFX1030W32-NEXT: s_mul_hi_u32 s10, s3, s8 +; GFX1030W32-NEXT: s_mul_i32 s8, s3, s8 +; GFX1030W32-NEXT: s_add_u32 s9, s9, s12 +; GFX1030W32-NEXT: s_addc_u32 s11, 0, s11 ; GFX1030W32-NEXT: s_mul_hi_u32 s13, s3, s7 -; GFX1030W32-NEXT: s_add_u32 s8, s11, s8 +; GFX1030W32-NEXT: s_add_u32 s8, s9, s8 ; GFX1030W32-NEXT: s_mul_i32 s7, s3, s7 -; GFX1030W32-NEXT: s_addc_u32 s8, s10, s9 +; GFX1030W32-NEXT: s_addc_u32 s8, s11, s10 ; GFX1030W32-NEXT: s_addc_u32 s9, s13, 0 ; GFX1030W32-NEXT: s_add_u32 s7, s8, s7 ; GFX1030W32-NEXT: s_addc_u32 s8, 0, s9 @@ -2713,11 +2662,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: s_sub_i32 s11, s3, s9 ; GFX1030W32-NEXT: s_sub_u32 s10, s2, s10 ; GFX1030W32-NEXT: s_cselect_b32 s12, -1, 0 -; GFX1030W32-NEXT: s_cmp_lg_u32 s12, 0 ; GFX1030W32-NEXT: s_subb_u32 s11, s11, s5 ; GFX1030W32-NEXT: s_sub_u32 s13, s10, s4 -; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1030W32-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1030W32-NEXT: s_subb_u32 s11, s11, 0 ; GFX1030W32-NEXT: s_cmp_ge_u32 s11, s5 ; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0 @@ -2790,8 +2736,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W64-NEXT: ; %bb.1: ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GFX1030W64-NEXT: s_sub_u32 s9, 0, s4 -; GFX1030W64-NEXT: s_subb_u32 s10, 0, s5 +; GFX1030W64-NEXT: s_sub_u32 s8, 0, s4 +; GFX1030W64-NEXT: s_subb_u32 s9, 0, s5 ; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0 ; GFX1030W64-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1030W64-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2800,109 +2746,102 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0 ; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1030W64-NEXT: v_readfirstlane_b32 s8, v1 -; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v0 -; GFX1030W64-NEXT: s_mul_i32 s7, s9, s8 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s9, s6 -; GFX1030W64-NEXT: s_mul_i32 s11, s10, s6 -; GFX1030W64-NEXT: s_add_i32 s7, s12, s7 -; GFX1030W64-NEXT: s_mul_i32 s13, s9, s6 -; GFX1030W64-NEXT: s_add_i32 s7, s7, s11 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s13 -; GFX1030W64-NEXT: s_mul_i32 s15, s6, s7 -; GFX1030W64-NEXT: s_mul_hi_u32 s14, s8, s13 -; GFX1030W64-NEXT: s_mul_i32 s11, s8, s13 -; GFX1030W64-NEXT: s_mul_hi_u32 s13, s6, s7 +; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v1 +; GFX1030W64-NEXT: v_readfirstlane_b32 s7, v0 +; GFX1030W64-NEXT: s_mul_i32 s10, s8, s6 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s8, s7 +; GFX1030W64-NEXT: s_mul_i32 s11, s9, s7 +; GFX1030W64-NEXT: s_add_i32 s10, s12, s10 +; GFX1030W64-NEXT: s_mul_i32 s13, s8, s7 +; GFX1030W64-NEXT: s_add_i32 s10, s10, s11 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s7, s13 +; GFX1030W64-NEXT: s_mul_i32 s15, s7, s10 +; GFX1030W64-NEXT: s_mul_hi_u32 s14, s6, s13 +; GFX1030W64-NEXT: s_mul_i32 s11, s6, s13 +; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s10 ; GFX1030W64-NEXT: s_add_u32 s12, s12, s15 ; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13 -; GFX1030W64-NEXT: s_mul_hi_u32 s16, s8, s7 +; GFX1030W64-NEXT: s_mul_hi_u32 s16, s6, s10 ; GFX1030W64-NEXT: s_add_u32 s11, s12, s11 -; GFX1030W64-NEXT: s_mul_i32 s7, s8, s7 +; GFX1030W64-NEXT: s_mul_i32 s10, s6, s10 ; GFX1030W64-NEXT: s_addc_u32 s11, s13, s14 ; GFX1030W64-NEXT: s_addc_u32 s12, s16, 0 -; GFX1030W64-NEXT: s_add_u32 s7, s11, s7 +; GFX1030W64-NEXT: s_add_u32 s10, s11, s10 ; GFX1030W64-NEXT: s_addc_u32 s11, 0, s12 -; GFX1030W64-NEXT: s_add_u32 s12, s6, s7 -; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX1030W64-NEXT: s_mul_hi_u32 s13, s9, s12 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1030W64-NEXT: s_mul_i32 s6, s9, s12 -; GFX1030W64-NEXT: s_addc_u32 s8, s8, s11 -; GFX1030W64-NEXT: s_mul_i32 s10, s10, s12 -; GFX1030W64-NEXT: s_mul_i32 s9, s9, s8 -; GFX1030W64-NEXT: s_mul_hi_u32 s7, s12, s6 -; GFX1030W64-NEXT: s_add_i32 s9, s13, s9 -; GFX1030W64-NEXT: s_mul_hi_u32 s11, s8, s6 -; GFX1030W64-NEXT: s_add_i32 s9, s9, s10 -; GFX1030W64-NEXT: s_mul_i32 s6, s8, s6 -; GFX1030W64-NEXT: s_mul_i32 s14, s12, s9 -; GFX1030W64-NEXT: s_mul_hi_u32 s13, s12, s9 -; GFX1030W64-NEXT: s_add_u32 s7, s7, s14 +; GFX1030W64-NEXT: s_add_u32 s7, s7, s10 +; GFX1030W64-NEXT: s_addc_u32 s6, s6, s11 +; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s7 +; GFX1030W64-NEXT: s_mul_i32 s11, s8, s7 +; GFX1030W64-NEXT: s_mul_i32 s8, s8, s6 +; GFX1030W64-NEXT: s_mul_i32 s9, s9, s7 +; GFX1030W64-NEXT: s_add_i32 s8, s10, s8 +; GFX1030W64-NEXT: s_mul_i32 s10, s6, s11 +; GFX1030W64-NEXT: s_add_i32 s8, s8, s9 +; GFX1030W64-NEXT: s_mul_hi_u32 s9, s7, s11 +; GFX1030W64-NEXT: s_mul_i32 s14, s7, s8 +; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s8 +; GFX1030W64-NEXT: s_add_u32 s9, s9, s14 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s11 ; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13 -; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s9 -; GFX1030W64-NEXT: s_add_u32 s6, s7, s6 -; GFX1030W64-NEXT: s_mul_i32 s9, s8, s9 -; GFX1030W64-NEXT: s_addc_u32 s6, s13, s11 -; GFX1030W64-NEXT: s_addc_u32 s7, s10, 0 -; GFX1030W64-NEXT: s_add_u32 s6, s6, s9 -; GFX1030W64-NEXT: s_addc_u32 s9, 0, s7 -; GFX1030W64-NEXT: s_add_u32 s10, s12, s6 -; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX1030W64-NEXT: s_mul_hi_u32 s11, s2, s10 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1030W64-NEXT: s_mul_hi_u32 s6, s3, s10 -; GFX1030W64-NEXT: s_addc_u32 s7, s8, s9 -; GFX1030W64-NEXT: s_mul_i32 s8, s3, s10 -; GFX1030W64-NEXT: s_mul_i32 s10, s2, s7 -; GFX1030W64-NEXT: s_mul_hi_u32 s9, s2, s7 -; GFX1030W64-NEXT: s_add_u32 s10, s11, s10 -; GFX1030W64-NEXT: s_addc_u32 s9, 0, s9 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s7 -; GFX1030W64-NEXT: s_add_u32 s8, s10, s8 +; GFX1030W64-NEXT: s_mul_hi_u32 s11, s6, s8 +; GFX1030W64-NEXT: s_add_u32 s9, s9, s10 +; GFX1030W64-NEXT: s_mul_i32 s8, s6, s8 +; GFX1030W64-NEXT: s_addc_u32 s9, s13, s12 +; GFX1030W64-NEXT: s_addc_u32 s10, s11, 0 +; GFX1030W64-NEXT: s_add_u32 s8, s9, s8 +; GFX1030W64-NEXT: s_addc_u32 s9, 0, s10 +; GFX1030W64-NEXT: s_add_u32 s7, s7, s8 +; GFX1030W64-NEXT: s_addc_u32 s6, s6, s9 +; GFX1030W64-NEXT: s_mul_hi_u32 s8, s2, s7 +; GFX1030W64-NEXT: s_mul_i32 s11, s2, s6 +; GFX1030W64-NEXT: s_mul_hi_u32 s10, s2, s6 +; GFX1030W64-NEXT: s_mul_hi_u32 s9, s3, s7 ; GFX1030W64-NEXT: s_mul_i32 s7, s3, s7 -; GFX1030W64-NEXT: s_addc_u32 s6, s9, s6 +; GFX1030W64-NEXT: s_add_u32 s8, s8, s11 +; GFX1030W64-NEXT: s_addc_u32 s10, 0, s10 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s6 +; GFX1030W64-NEXT: s_add_u32 s7, s8, s7 +; GFX1030W64-NEXT: s_mul_i32 s6, s3, s6 +; GFX1030W64-NEXT: s_addc_u32 s7, s10, s9 ; GFX1030W64-NEXT: s_addc_u32 s8, s12, 0 -; GFX1030W64-NEXT: s_add_u32 s10, s6, s7 +; GFX1030W64-NEXT: s_add_u32 s10, s7, s6 ; GFX1030W64-NEXT: s_addc_u32 s11, 0, s8 ; GFX1030W64-NEXT: s_mul_hi_u32 s6, s4, s10 ; GFX1030W64-NEXT: s_mul_i32 s7, s4, s11 ; GFX1030W64-NEXT: s_mul_i32 s8, s5, s10 ; GFX1030W64-NEXT: s_add_i32 s6, s6, s7 -; GFX1030W64-NEXT: s_add_i32 s12, s6, s8 +; GFX1030W64-NEXT: s_add_i32 s8, s6, s8 ; GFX1030W64-NEXT: s_mul_i32 s6, s4, s10 -; GFX1030W64-NEXT: s_sub_i32 s8, s3, s12 -; GFX1030W64-NEXT: s_sub_u32 s13, s2, s6 +; GFX1030W64-NEXT: s_sub_i32 s9, s3, s8 +; GFX1030W64-NEXT: s_sub_u32 s12, s2, s6 ; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1030W64-NEXT: s_subb_u32 s14, s8, s5 -; GFX1030W64-NEXT: s_sub_u32 s15, s13, s4 -; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GFX1030W64-NEXT: s_subb_u32 s8, s14, 0 -; GFX1030W64-NEXT: s_cmp_ge_u32 s8, s5 -; GFX1030W64-NEXT: s_cselect_b32 s9, -1, 0 -; GFX1030W64-NEXT: s_cmp_ge_u32 s15, s4 +; GFX1030W64-NEXT: s_subb_u32 s9, s9, s5 +; GFX1030W64-NEXT: s_sub_u32 s13, s12, s4 +; GFX1030W64-NEXT: s_subb_u32 s9, s9, 0 +; GFX1030W64-NEXT: s_cmp_ge_u32 s9, s5 ; GFX1030W64-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1030W64-NEXT: s_cmp_eq_u32 s8, s5 -; GFX1030W64-NEXT: s_cselect_b32 s8, s14, s9 -; GFX1030W64-NEXT: s_add_u32 s9, s10, 1 +; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4 +; GFX1030W64-NEXT: s_cselect_b32 s13, -1, 0 +; GFX1030W64-NEXT: s_cmp_eq_u32 s9, s5 +; GFX1030W64-NEXT: s_cselect_b32 s9, s13, s14 +; GFX1030W64-NEXT: s_add_u32 s13, s10, 1 ; GFX1030W64-NEXT: s_addc_u32 s14, s11, 0 ; GFX1030W64-NEXT: s_add_u32 s15, s10, 2 ; GFX1030W64-NEXT: s_addc_u32 s16, s11, 0 -; GFX1030W64-NEXT: s_cmp_lg_u32 s8, 0 -; GFX1030W64-NEXT: s_cselect_b32 s15, s15, s9 +; GFX1030W64-NEXT: s_cmp_lg_u32 s9, 0 +; GFX1030W64-NEXT: s_cselect_b32 s13, s15, s13 ; GFX1030W64-NEXT: s_cselect_b32 s14, s16, s14 ; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1030W64-NEXT: s_subb_u32 s3, s3, s12 +; GFX1030W64-NEXT: s_subb_u32 s3, s3, s8 ; GFX1030W64-NEXT: s_cmp_ge_u32 s3, s5 ; GFX1030W64-NEXT: s_cselect_b32 s6, -1, 0 -; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4 +; GFX1030W64-NEXT: s_cmp_ge_u32 s12, s4 ; GFX1030W64-NEXT: s_cselect_b32 s7, -1, 0 ; GFX1030W64-NEXT: s_cmp_eq_u32 s3, s5 ; GFX1030W64-NEXT: s_cselect_b32 s3, s7, s6 ; GFX1030W64-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1030W64-NEXT: s_cselect_b32 s7, s14, s11 -; GFX1030W64-NEXT: s_cselect_b32 s6, s15, s10 +; GFX1030W64-NEXT: s_cselect_b32 s6, s13, s10 ; GFX1030W64-NEXT: s_cbranch_execnz .LBB16_3 ; GFX1030W64-NEXT: .LBB16_2: ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4 @@ -2988,44 +2927,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_add_u32 s11, s12, s11 ; GFX11-NEXT: s_addc_u32 s12, 0, s13 ; GFX11-NEXT: s_add_u32 s8, s8, s11 -; GFX11-NEXT: s_cselect_b32 s11, -1, 0 -; GFX11-NEXT: s_mul_hi_u32 s13, s9, s8 -; GFX11-NEXT: s_cmp_lg_u32 s11, 0 -; GFX11-NEXT: s_mul_i32 s11, s9, s8 ; GFX11-NEXT: s_addc_u32 s7, s7, s12 -; GFX11-NEXT: s_mul_i32 s10, s10, s8 +; GFX11-NEXT: s_mul_hi_u32 s11, s9, s8 +; GFX11-NEXT: s_mul_i32 s12, s9, s8 ; GFX11-NEXT: s_mul_i32 s9, s9, s7 -; GFX11-NEXT: s_mul_hi_u32 s12, s8, s11 -; GFX11-NEXT: s_add_i32 s9, s13, s9 -; GFX11-NEXT: s_mul_hi_u32 s13, s7, s11 +; GFX11-NEXT: s_mul_i32 s10, s10, s8 +; GFX11-NEXT: s_add_i32 s9, s11, s9 +; GFX11-NEXT: s_mul_i32 s11, s7, s12 ; GFX11-NEXT: s_add_i32 s9, s9, s10 -; GFX11-NEXT: s_mul_i32 s10, s7, s11 +; GFX11-NEXT: s_mul_hi_u32 s10, s8, s12 ; GFX11-NEXT: s_mul_i32 s15, s8, s9 ; GFX11-NEXT: s_mul_hi_u32 s14, s8, s9 -; GFX11-NEXT: s_add_u32 s12, s12, s15 +; GFX11-NEXT: s_add_u32 s10, s10, s15 +; GFX11-NEXT: s_mul_hi_u32 s13, s7, s12 ; GFX11-NEXT: s_addc_u32 s14, 0, s14 -; GFX11-NEXT: s_mul_hi_u32 s11, s7, s9 -; GFX11-NEXT: s_add_u32 s10, s12, s10 +; GFX11-NEXT: s_mul_hi_u32 s12, s7, s9 +; GFX11-NEXT: s_add_u32 s10, s10, s11 ; GFX11-NEXT: s_mul_i32 s9, s7, s9 ; GFX11-NEXT: s_addc_u32 s10, s14, s13 -; GFX11-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-NEXT: s_addc_u32 s11, s12, 0 ; GFX11-NEXT: s_add_u32 s9, s10, s9 ; GFX11-NEXT: s_addc_u32 s10, 0, s11 ; GFX11-NEXT: s_add_u32 s8, s8, s9 -; GFX11-NEXT: s_cselect_b32 s9, -1, 0 -; GFX11-NEXT: s_mul_hi_u32 s11, s2, s8 -; GFX11-NEXT: s_cmp_lg_u32 s9, 0 -; GFX11-NEXT: s_mul_hi_u32 s9, s3, s8 ; GFX11-NEXT: s_addc_u32 s7, s7, s10 -; GFX11-NEXT: s_mul_i32 s8, s3, s8 +; GFX11-NEXT: s_mul_hi_u32 s9, s2, s8 ; GFX11-NEXT: s_mul_i32 s12, s2, s7 -; GFX11-NEXT: s_mul_hi_u32 s10, s2, s7 -; GFX11-NEXT: s_add_u32 s11, s11, s12 -; GFX11-NEXT: s_addc_u32 s10, 0, s10 +; GFX11-NEXT: s_mul_hi_u32 s11, s2, s7 +; GFX11-NEXT: s_mul_hi_u32 s10, s3, s8 +; GFX11-NEXT: s_mul_i32 s8, s3, s8 +; GFX11-NEXT: s_add_u32 s9, s9, s12 +; GFX11-NEXT: s_addc_u32 s11, 0, s11 ; GFX11-NEXT: s_mul_hi_u32 s13, s3, s7 -; GFX11-NEXT: s_add_u32 s8, s11, s8 +; GFX11-NEXT: s_add_u32 s8, s9, s8 ; GFX11-NEXT: s_mul_i32 s7, s3, s7 -; GFX11-NEXT: s_addc_u32 s8, s10, s9 +; GFX11-NEXT: s_addc_u32 s8, s11, s10 ; GFX11-NEXT: s_addc_u32 s9, s13, 0 ; GFX11-NEXT: s_add_u32 s7, s8, s7 ; GFX11-NEXT: s_addc_u32 s8, 0, s9 @@ -3035,17 +2970,14 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_add_i32 s9, s9, s10 ; GFX11-NEXT: s_mul_i32 s10, s4, s7 ; GFX11-NEXT: s_add_i32 s9, s9, s11 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_sub_i32 s11, s3, s9 ; GFX11-NEXT: s_sub_u32 s10, s2, s10 ; GFX11-NEXT: s_cselect_b32 s12, -1, 0 -; GFX11-NEXT: s_cmp_lg_u32 s12, 0 ; GFX11-NEXT: s_subb_u32 s11, s11, s5 ; GFX11-NEXT: s_sub_u32 s13, s10, s4 -; GFX11-NEXT: s_cselect_b32 s14, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_lg_u32 s14, 0 ; GFX11-NEXT: s_subb_u32 s11, s11, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_cmp_ge_u32 s11, s5 ; GFX11-NEXT: s_cselect_b32 s14, -1, 0 ; GFX11-NEXT: s_cmp_ge_u32 s13, s4 @@ -3118,9 +3050,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_or_b64 s[4:5], s[2:3], s[6:7] -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_b64 s[4:5], s[4:5], 0xffffffff00000000 -; GFX1250-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1250-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1250-NEXT: ; %bb.1: ; GFX1250-NEXT: s_cvt_f32_u32 s4, s6 @@ -3155,12 +3086,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[4:5], s[12:13] ; GFX1250-NEXT: s_add_co_u32 s8, s8, s12 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1250-NEXT: s_add_co_ci_u32 s9, s9, s13 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_mul_u64 s[10:11], s[10:11], s[8:9] -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_mul_hi_u32 s13, s8, s11 ; GFX1250-NEXT: s_mul_i32 s12, s8, s11 ; GFX1250-NEXT: s_mul_hi_u32 s4, s8, s10 @@ -3175,19 +3103,17 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_add_nc_u64 s[10:11], s[4:5], s[10:11] ; GFX1250-NEXT: s_add_co_u32 s8, s8, s10 -; GFX1250-NEXT: s_cselect_b32 s10, -1, 0 -; GFX1250-NEXT: s_mul_hi_u32 s4, s2, s8 -; GFX1250-NEXT: s_cmp_lg_u32 s10, 0 -; GFX1250-NEXT: s_mul_hi_u32 s12, s3, s8 ; GFX1250-NEXT: s_add_co_ci_u32 s10, s9, s11 -; GFX1250-NEXT: s_mul_i32 s11, s3, s8 +; GFX1250-NEXT: s_mul_hi_u32 s4, s2, s8 +; GFX1250-NEXT: s_mul_hi_u32 s11, s3, s8 +; GFX1250-NEXT: s_mul_i32 s12, s3, s8 ; GFX1250-NEXT: s_mul_hi_u32 s9, s2, s10 ; GFX1250-NEXT: s_mul_i32 s8, s2, s10 ; GFX1250-NEXT: s_mul_hi_u32 s13, s3, s10 ; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[8:9] ; GFX1250-NEXT: s_mul_i32 s10, s3, s10 -; GFX1250-NEXT: s_add_co_u32 s4, s8, s11 -; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s12 +; GFX1250-NEXT: s_add_co_u32 s4, s8, s12 +; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s11 ; GFX1250-NEXT: s_add_co_ci_u32 s11, s13, 0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[10:11] @@ -3202,10 +3128,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_cmp_lg_u32 s8, 0 ; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, s7 ; GFX1250-NEXT: s_sub_co_u32 s13, s4, s6 -; GFX1250-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, 0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_cmp_ge_u32 s12, s7 ; GFX1250-NEXT: s_cselect_b32 s14, -1, 0 ; GFX1250-NEXT: s_cmp_ge_u32 s13, s6 diff --git a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll index 57a1e4c..ec92edb 100644 --- a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll +++ b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll @@ -3385,7 +3385,7 @@ declare half @llvm.canonicalize.f16(half) declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) attributes #0 = { nounwind "amdgpu-ieee"="false" } -attributes #1 = { nounwind "unsafe-fp-math"="true" "no-nans-fp-math"="true" } +attributes #1 = { nounwind "no-nans-fp-math"="true" } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GFX11NONANS-FAKE16: {{.*}} ; GFX11NONANS-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index 4b151b9..07e6a76 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -714,9 +714,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; VI-NEXT: s_lshl_b32 s2, s2, 8 ; VI-NEXT: s_or_b32 s2, s2, s3 ; VI-NEXT: s_lshl_b32 s3, s2, 16 -; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_flbit_i32_b32 s3, s3 -; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_cselect_b32 s2, s3, 32 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll index cefcbdd..fca57be 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -1491,7 +1491,6 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace( ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: s_cbranch_scc0 .LBB14_4 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_mov_b32 s11, 0xf000 @@ -1521,7 +1520,6 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace( ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s6, 16 -; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: s_cbranch_scc0 .LBB14_4 ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: s_mov_b32 s11, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll index d8a5e7fa..dbdea8e 100644 --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -14,7 +14,6 @@ define i32 @s_add_co_select_user() { ; GFX7-NEXT: s_add_u32 s7, s6, s6 ; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX7-NEXT: s_or_b32 s4, s4, s5 -; GFX7-NEXT: s_cmp_lg_u32 s4, 0 ; GFX7-NEXT: s_addc_u32 s8, s6, 0 ; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec @@ -31,8 +30,6 @@ define i32 @s_add_co_select_user() { ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s7, s6, s6 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9-NEXT: s_addc_u32 s8, s6, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec @@ -49,8 +46,6 @@ define i32 @s_add_co_select_user() { ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s5, s4, s4 -; GFX10-NEXT: s_cselect_b32 s6, -1, 0 -; GFX10-NEXT: s_cmp_lg_u32 s6, 0 ; GFX10-NEXT: s_addc_u32 s6, s4, 0 ; GFX10-NEXT: s_cselect_b32 s7, -1, 0 ; GFX10-NEXT: s_and_b32 s7, s7, exec_lo @@ -67,16 +62,13 @@ define i32 @s_add_co_select_user() { ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s1, s0, s0 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_addc_u32 s2, s0, 0 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s3, s3, exec_lo ; GFX11-NEXT: s_cselect_b32 s2, s2, 0 ; GFX11-NEXT: s_cmp_gt_u32 s0, 31 ; GFX11-NEXT: s_cselect_b32 s0, s1, s2 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] bb: @@ -104,7 +96,6 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX7-NEXT: s_add_u32 s0, s2, s2 ; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_cmp_lg_u32 s0, 0 ; GFX7-NEXT: s_addc_u32 s0, s2, 0 ; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX7-NEXT: s_andn2_b64 vcc, exec, s[0:1] @@ -125,12 +116,10 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; ; GFX9-LABEL: s_add_co_br_user: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s2, s2 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_addc_u32 s0, s2, 0 +; GFX9-NEXT: s_add_u32 s1, s0, s0 +; GFX9-NEXT: s_addc_u32 s0, s0, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GFX9-NEXT: s_cbranch_vccnz .LBB1_2 @@ -153,8 +142,6 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s1, s0, s0 -; GFX10-NEXT: s_cselect_b32 s1, -1, 0 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: s_addc_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s0, -1, 0 ; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s0 @@ -178,11 +165,9 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s1, s0, s0 -; GFX11-NEXT: s_cselect_b32 s1, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-NEXT: s_addc_u32 s0, s0, 0 ; GFX11-NEXT: s_cselect_b32 s0, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_vccnz .LBB1_2 ; GFX11-NEXT: ; %bb.1: ; %bb0 diff --git a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll index 13206ad..f45070c 100644 --- a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s -; RUN: llc -mtriple=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s +; RUN: llc -mtriple=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s +; RUN: llc -mtriple=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s -; RUN: llc -mtriple=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FASTFMA %s -; RUN: llc -mtriple=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SLOWFMA %s +; RUN: llc -mtriple=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FASTFMA %s +; RUN: llc -mtriple=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SLOWFMA %s ; FIXME: This should also fold when fma is actually fast if an FMA ; exists in the original program. diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index 62847b1..9a17538 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -1117,7 +1117,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; SI: ; %bb.0: ; SI-NEXT: s_and_b32 s3, s1, 0x1ff ; SI-NEXT: s_or_b32 s0, s3, s0 -; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; SI-NEXT: s_lshr_b32 s0, s1, 8 @@ -1169,7 +1168,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; VI: ; %bb.0: ; VI-NEXT: s_and_b32 s3, s1, 0x1ff ; VI-NEXT: s_or_b32 s0, s3, s0 -; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; VI-NEXT: s_lshr_b32 s0, s1, 8 @@ -1217,7 +1215,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s3, s1, 0x1ff ; GFX9-NEXT: s_or_b32 s0, s3, s0 -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX9-NEXT: s_lshr_b32 s0, s1, 8 @@ -1264,11 +1261,9 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_and_b32 s3, s1, 0x1ff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_or_b32 s0, s3, s0 -; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-TRUE16-NEXT: s_cselect_b32 s0, -1, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX11-TRUE16-NEXT: s_bfe_u32 s0, s1, 0xb0014 ; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 8 @@ -1320,11 +1315,9 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_and_b32 s3, s1, 0x1ff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_or_b32 s0, s3, s0 -; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-FAKE16-NEXT: s_cselect_b32 s0, -1, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX11-FAKE16-NEXT: s_bfe_u32 s0, s1, 0xb0014 ; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 8 @@ -4023,7 +4016,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; SI-NEXT: s_and_b32 s6, s4, 0xffe ; SI-NEXT: s_and_b32 s4, s1, 0x1ff ; SI-NEXT: s_or_b32 s0, s4, s0 -; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s5 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] @@ -4066,7 +4058,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; SI-NEXT: s_and_b32 s5, s0, 0xffe ; SI-NEXT: s_and_b32 s0, s3, 0x1ff ; SI-NEXT: s_or_b32 s0, s0, s2 -; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; SI-NEXT: v_readfirstlane_b32 s0, v2 @@ -4120,10 +4111,9 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; VI-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_lshr_b32 s5, s3, 8 -; VI-NEXT: s_and_b32 s6, s3, 0x1ff ; VI-NEXT: s_and_b32 s5, s5, 0xffe +; VI-NEXT: s_and_b32 s6, s3, 0x1ff ; VI-NEXT: s_or_b32 s2, s6, s2 -; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cselect_b64 s[6:7], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7] ; VI-NEXT: s_bfe_u32 s3, s3, 0xb0014 @@ -4163,7 +4153,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; VI-NEXT: s_and_b32 s7, s2, 0xffe ; VI-NEXT: s_and_b32 s2, s1, 0x1ff ; VI-NEXT: s_or_b32 s0, s2, s0 -; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; VI-NEXT: s_bfe_u32 s1, s1, 0xb0014 @@ -4209,10 +4198,9 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; GFX9-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s5, s3, 8 -; GFX9-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX9-NEXT: s_and_b32 s5, s5, 0xffe +; GFX9-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX9-NEXT: s_or_b32 s2, s6, s2 -; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7] ; GFX9-NEXT: s_bfe_u32 s6, s3, 0xb0014 @@ -4254,7 +4242,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; GFX9-NEXT: s_and_b32 s6, s2, 0xffe ; GFX9-NEXT: s_and_b32 s2, s1, 0x1ff ; GFX9-NEXT: s_or_b32 s0, s2, s0 -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 @@ -4301,11 +4288,10 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; ; GFX11-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b32 s5, s3, 0x1ff -; GFX11-NEXT: s_lshr_b32 s6, s3, 8 -; GFX11-NEXT: s_or_b32 s2, s5, s2 -; GFX11-NEXT: s_and_b32 s5, s6, 0xffe -; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_lshr_b32 s5, s3, 8 +; GFX11-NEXT: s_and_b32 s6, s3, 0x1ff +; GFX11-NEXT: s_and_b32 s5, s5, 0xffe +; GFX11-NEXT: s_or_b32 s2, s6, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 @@ -4348,13 +4334,12 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x40f ; GFX11-NEXT: s_cselect_b32 s2, s5, s6 ; GFX11-NEXT: s_lshr_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s6, s1, 0x1ff ; GFX11-NEXT: s_lshr_b32 s5, s1, 8 ; GFX11-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX11-NEXT: s_or_b32 s0, s6, s0 +; GFX11-NEXT: s_and_b32 s6, s1, 0x1ff ; GFX11-NEXT: s_and_b32 s5, s5, 0xffe ; GFX11-NEXT: s_or_b32 s2, s3, s2 -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_or_b32 s0, s6, s0 ; GFX11-NEXT: s_cselect_b32 s0, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll index acb32d4..11476a6 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll @@ -127,7 +127,7 @@ define amdgpu_kernel void @s_fdiv_v4f64(ptr addrspace(1) %out, <4 x double> %num ; GCN-LABEL: {{^}}div_fast_2_x_pat_f64: ; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0.5 ; GCN: buffer_store_dwordx2 [[MUL]] -define amdgpu_kernel void @div_fast_2_x_pat_f64(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @div_fast_2_x_pat_f64(ptr addrspace(1) %out) #0 { %x = load double, ptr addrspace(1) poison %rcp = fdiv fast double %x, 2.0 store double %rcp, ptr addrspace(1) %out, align 4 @@ -139,7 +139,7 @@ define amdgpu_kernel void @div_fast_2_x_pat_f64(ptr addrspace(1) %out) #1 { ; GCN-DAG: v_mov_b32_e32 v[[K_HI:[0-9]+]], 0x3fb99999 ; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, v[[[K_LO]]:[[K_HI]]] ; GCN: buffer_store_dwordx2 [[MUL]] -define amdgpu_kernel void @div_fast_k_x_pat_f64(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @div_fast_k_x_pat_f64(ptr addrspace(1) %out) #0 { %x = load double, ptr addrspace(1) poison %rcp = fdiv fast double %x, 10.0 store double %rcp, ptr addrspace(1) %out, align 4 @@ -151,7 +151,7 @@ define amdgpu_kernel void @div_fast_k_x_pat_f64(ptr addrspace(1) %out) #1 { ; GCN-DAG: v_mov_b32_e32 v[[K_HI:[0-9]+]], 0xbfb99999 ; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, v[[[K_LO]]:[[K_HI]]] ; GCN: buffer_store_dwordx2 [[MUL]] -define amdgpu_kernel void @div_fast_neg_k_x_pat_f64(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @div_fast_neg_k_x_pat_f64(ptr addrspace(1) %out) #0 { %x = load double, ptr addrspace(1) poison %rcp = fdiv fast double %x, -10.0 store double %rcp, ptr addrspace(1) %out, align 4 @@ -159,4 +159,3 @@ define amdgpu_kernel void @div_fast_neg_k_x_pat_f64(ptr addrspace(1) %out) #1 { } attributes #0 = { nounwind } -attributes #1 = { nounwind "unsafe-fp-math"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll b/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll index 92eb4a6..0a266bc 100644 --- a/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll +++ b/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll @@ -284,4 +284,4 @@ define <2 x float> @unsafe_fast_fmul_fsub_ditribute_post_legalize(float %arg0, < ret <2 x float> %tmp1 } -attributes #0 = { "no-infs-fp-math"="true" "unsafe-fp-math"="true" } +attributes #0 = { "no-infs-fp-math"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll b/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll index bc85dc2..3e513de 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll @@ -219,8 +219,8 @@ define <2 x bfloat> @v_test_fmed3_r_i_i_v2bf16_minimumnum_maximumnum(<2 x bfloat } attributes #0 = { nounwind readnone } -attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" } -attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" } +attributes #1 = { nounwind "no-nans-fp-math"="false" } +attributes #2 = { nounwind "no-nans-fp-math"="true" } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GFX11: {{.*}} ; GFX11-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index 3145a27..60ac0b9 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -8905,4 +8905,4 @@ declare half @llvm.minnum.f16(half, half) #0 declare half @llvm.maxnum.f16(half, half) #0 attributes #0 = { nounwind readnone } -attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" } +attributes #2 = { nounwind "no-nans-fp-math"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.legal.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.legal.f16.ll index d8bbda1..69d1ee3f 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.legal.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.legal.f16.ll @@ -159,7 +159,7 @@ declare half @llvm.amdgcn.interp.p2.f16(float, float, i32, i32, i1, i32) #0 attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #1 = { nounwind readnone } -attributes #2 = { nounwind "unsafe-fp-math"="true" } +attributes #2 = { nounwind } attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" } attributes #4 = { nounwind "amdgpu-ieee"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll index aaea4f7..b3202cb 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -8006,7 +8006,7 @@ declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0 attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #1 = { nounwind readnone } -attributes #2 = { nounwind "unsafe-fp-math"="true" } +attributes #2 = { nounwind } attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GCN-NSZ: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll index d41e2c6..8df7564 100644 --- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,GFX9 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-TRUE16 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-FAKE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-FAKE16 %s define amdgpu_kernel void @fpext_f16_to_f32( ; SI-LABEL: fpext_f16_to_f32: diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll index a43292d..a043d53 100644 --- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s define amdgpu_kernel void @fptosi_f16_to_i16( diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll index 96cb621..af1ab37 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s define amdgpu_kernel void @fptoui_f16_to_i16( diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll index b0dd187..c28b25c7 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -599,10 +599,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; SI-GISEL-NEXT: s_and_b32 s6, s6, 0xffe ; SI-GISEL-NEXT: s_or_b32 s4, s7, s4 -; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; SI-GISEL-NEXT: s_or_b32 s4, s6, s4 -; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0 ; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9 ; SI-GISEL-NEXT: s_lshl_b32 s7, s3, 12 @@ -711,10 +709,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe ; VI-GISEL-NEXT: s_or_b32 s2, s6, s2 -; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; VI-GISEL-NEXT: s_or_b32 s2, s5, s2 -; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4 ; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12 @@ -824,10 +820,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX9-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffe ; GFX9-GISEL-NEXT: s_or_b32 s2, s6, s2 -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; GFX9-GISEL-NEXT: s_or_b32 s2, s5, s2 -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX9-GISEL-NEXT: s_sub_i32 s7, 1, s4 ; GFX9-GISEL-NEXT: s_lshl_b32 s6, s4, 12 @@ -937,10 +931,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX950-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; GFX950-GISEL-NEXT: s_and_b32 s5, s5, 0xffe ; GFX950-GISEL-NEXT: s_or_b32 s2, s6, s2 -; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; GFX950-GISEL-NEXT: s_or_b32 s2, s5, s2 -; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX950-GISEL-NEXT: s_sub_i32 s7, 1, s4 ; GFX950-GISEL-NEXT: s_lshl_b32 s6, s4, 12 @@ -1118,17 +1110,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8 -; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2 +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe -; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2 -; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000 @@ -1175,17 +1165,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8 -; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2 +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe -; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2 -; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000 @@ -1366,17 +1354,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8 -; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2 +; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s4, 0xfc10 ; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe -; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0 -; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2 -; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s6, 1, s4 ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000 @@ -1423,17 +1409,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8 -; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2 +; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s4, 0xfc10 ; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe -; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0 -; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2 -; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s6, 1, s4 ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000 @@ -2154,10 +2138,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe ; SI-GISEL-NEXT: s_or_b32 s4, s9, s4 -; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; SI-GISEL-NEXT: s_or_b32 s4, s8, s4 -; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0 ; SI-GISEL-NEXT: s_lshl_b32 s8, s8, 9 ; SI-GISEL-NEXT: s_lshl_b32 s9, s3, 12 @@ -2193,12 +2175,10 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; SI-GISEL-NEXT: s_and_b32 s4, s4, 0x8000 ; SI-GISEL-NEXT: s_addk_i32 s5, 0xfc10 ; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe -; SI-GISEL-NEXT: s_or_b32 s6, s9, s6 ; SI-GISEL-NEXT: s_or_b32 s3, s4, s3 -; SI-GISEL-NEXT: s_cmp_lg_u32 s6, 0 +; SI-GISEL-NEXT: s_or_b32 s4, s9, s6 ; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; SI-GISEL-NEXT: s_or_b32 s4, s8, s4 -; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0 ; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9 ; SI-GISEL-NEXT: s_lshl_b32 s8, s5, 12 @@ -2355,10 +2335,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; VI-GISEL-NEXT: s_addk_i32 s2, 0xfc10 ; VI-GISEL-NEXT: s_and_b32 s3, s3, 0xffe ; VI-GISEL-NEXT: s_or_b32 s4, s8, s4 -; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; VI-GISEL-NEXT: s_or_b32 s3, s3, s4 -; VI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; VI-GISEL-NEXT: s_sub_i32 s9, 1, s2 ; VI-GISEL-NEXT: s_lshl_b32 s8, s2, 12 @@ -2392,14 +2370,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; VI-GISEL-NEXT: s_or_b32 s2, s3, s2 ; VI-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014 ; VI-GISEL-NEXT: s_lshr_b32 s4, s7, 8 -; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; VI-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; VI-GISEL-NEXT: s_and_b32 s4, s4, 0xffe +; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; VI-GISEL-NEXT: s_or_b32 s5, s5, s6 -; VI-GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; VI-GISEL-NEXT: s_or_b32 s4, s4, s5 -; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; VI-GISEL-NEXT: s_sub_i32 s8, 1, s3 ; VI-GISEL-NEXT: s_lshl_b32 s6, s3, 12 @@ -2555,10 +2531,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX9-GISEL-NEXT: s_addk_i32 s2, 0xfc10 ; GFX9-GISEL-NEXT: s_and_b32 s3, s3, 0xffe ; GFX9-GISEL-NEXT: s_or_b32 s4, s8, s4 -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-GISEL-NEXT: s_or_b32 s3, s3, s4 -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-GISEL-NEXT: s_sub_i32 s9, 1, s2 ; GFX9-GISEL-NEXT: s_lshl_b32 s8, s2, 12 @@ -2592,14 +2566,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2 ; GFX9-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014 ; GFX9-GISEL-NEXT: s_lshr_b32 s4, s7, 8 -; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; GFX9-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; GFX9-GISEL-NEXT: s_and_b32 s4, s4, 0xffe +; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; GFX9-GISEL-NEXT: s_or_b32 s5, s5, s6 -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX9-GISEL-NEXT: s_or_b32 s4, s4, s5 -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX9-GISEL-NEXT: s_sub_i32 s8, 1, s3 ; GFX9-GISEL-NEXT: s_lshl_b32 s6, s3, 12 @@ -2752,10 +2724,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX950-GISEL-NEXT: s_addk_i32 s2, 0xfc10 ; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0xffe ; GFX950-GISEL-NEXT: s_or_b32 s4, s8, s4 -; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GFX950-GISEL-NEXT: s_or_b32 s3, s3, s4 -; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GFX950-GISEL-NEXT: s_sub_i32 s9, 1, s2 ; GFX950-GISEL-NEXT: s_lshl_b32 s8, s2, 12 @@ -2789,14 +2759,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX950-GISEL-NEXT: s_or_b32 s2, s3, s2 ; GFX950-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014 ; GFX950-GISEL-NEXT: s_lshr_b32 s4, s7, 8 -; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; GFX950-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0xffe +; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; GFX950-GISEL-NEXT: s_or_b32 s5, s5, s6 -; GFX950-GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX950-GISEL-NEXT: s_or_b32 s4, s4, s5 -; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX950-GISEL-NEXT: s_sub_i32 s8, 1, s3 ; GFX950-GISEL-NEXT: s_lshl_b32 s6, s3, 12 @@ -3073,17 +3041,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014 ; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8 -; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4 +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s2, 0xfc10 ; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe -; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4 -; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 ; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s8, 1, s2 ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000 @@ -3115,19 +3081,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-GISEL-TRUE16-NEXT: s_cmpk_eq_i32 s2, 0x40f ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3 ; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16 -; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014 ; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8 ; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6 +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2 -; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s8, s6 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3 -; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000 @@ -3176,17 +3140,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014 ; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8 -; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4 +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s2, 0xfc10 ; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe -; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4 -; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 ; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s8, 1, s2 ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000 @@ -3218,19 +3180,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-GISEL-FAKE16-NEXT: s_cmpk_eq_i32 s2, 0x40f ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3 ; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16 -; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014 ; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8 ; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6 +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2 -; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s8, s6 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3 -; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000 @@ -3511,17 +3471,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014 ; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8 -; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4 +; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s2, 0xfc10 ; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe -; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 -; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4 -; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s8, 1, s2 ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000 @@ -3553,19 +3511,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX1250-GISEL-TRUE16-NEXT: s_cmp_eq_u32 s2, 0x40f ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3 ; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16 -; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014 ; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8 ; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6 +; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s4, 0xfc10 ; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2 -; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0 +; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s8, s6 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0 -; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3 -; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s6, 1, s4 ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000 @@ -3614,17 +3570,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014 ; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8 -; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4 +; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s2, 0xfc10 ; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe -; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 -; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4 -; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s8, 1, s2 ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000 @@ -3656,19 +3610,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX1250-GISEL-FAKE16-NEXT: s_cmp_eq_u32 s2, 0x40f ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3 ; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16 -; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014 ; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8 ; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6 +; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s4, 0xfc10 ; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2 -; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0 +; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s8, s6 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0 -; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3 -; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s6, 1, s4 ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000 diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll index 5d31177..b6b26a4 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll @@ -2,14 +2,14 @@ ; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI-SDAG,VI-SAFE-SDAG %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI-GISEL,VI-SAFE-GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI-SDAG,VI-UNSAFE-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI-SDAG,VI-UNSAFE-SDAG %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10-SDAG,GFX10-SAFE-SDAG %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10-GISEL,GFX10-SAFE-GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=0 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX10-SDAG,GFX10-UNSAFE-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10-SDAG,GFX10-UNSAFE-SDAG %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-SAFE-SDAG %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-SAFE-GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-TRUE16 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-FAKE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-FAKE16 %s @@ -182,7 +182,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; SI-NEXT: s_and_b32 s1, s7, 0x1ff ; SI-NEXT: s_and_b32 s8, s0, 0xffe ; SI-NEXT: s_or_b32 s0, s1, s6 -; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014 @@ -237,7 +236,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; VI-SDAG-NEXT: s_and_b32 s8, s4, 0xffe ; VI-SDAG-NEXT: s_and_b32 s4, s7, 0x1ff ; VI-SDAG-NEXT: s_or_b32 s4, s4, s6 -; VI-SDAG-NEXT: s_cmp_lg_u32 s4, 0 ; VI-SDAG-NEXT: s_mov_b32 s1, s5 ; VI-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0 ; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] @@ -290,10 +288,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe ; VI-GISEL-NEXT: s_or_b32 s2, s6, s2 -; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; VI-GISEL-NEXT: s_or_b32 s2, s5, s2 -; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4 ; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12 @@ -335,11 +331,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff -; GFX10-SDAG-NEXT: s_lshr_b32 s5, s3, 8 -; GFX10-SDAG-NEXT: s_or_b32 s2, s4, s2 -; GFX10-SDAG-NEXT: s_and_b32 s4, s5, 0xffe -; GFX10-SDAG-NEXT: s_cmp_lg_u32 s2, 0 +; GFX10-SDAG-NEXT: s_lshr_b32 s4, s3, 8 +; GFX10-SDAG-NEXT: s_and_b32 s5, s3, 0x1ff +; GFX10-SDAG-NEXT: s_and_b32 s4, s4, 0xffe +; GFX10-SDAG-NEXT: s_or_b32 s2, s5, s2 ; GFX10-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX10-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014 @@ -387,16 +382,14 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX10-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX10-GISEL-NEXT: s_lshr_b32 s5, s3, 8 -; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2 +; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX10-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; GFX10-GISEL-NEXT: s_and_b32 s5, s5, 0xffe -; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2 ; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; GFX10-GISEL-NEXT: s_or_b32 s2, s5, s2 -; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX10-GISEL-NEXT: s_sub_i32 s6, 1, s4 ; GFX10-GISEL-NEXT: s_or_b32 s8, s2, 0x1000 @@ -438,11 +431,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff -; GFX11-SDAG-NEXT: s_lshr_b32 s5, s3, 8 -; GFX11-SDAG-NEXT: s_or_b32 s2, s4, s2 -; GFX11-SDAG-NEXT: s_and_b32 s4, s5, 0xffe -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-SDAG-NEXT: s_lshr_b32 s4, s3, 8 +; GFX11-SDAG-NEXT: s_and_b32 s5, s3, 0x1ff +; GFX11-SDAG-NEXT: s_and_b32 s4, s4, 0xffe +; GFX11-SDAG-NEXT: s_or_b32 s2, s5, s2 ; GFX11-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 @@ -498,17 +490,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX11-GISEL-NEXT: s_lshr_b32 s5, s3, 8 -; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2 +; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-NEXT: s_and_b32 s5, s5, 0xffe -; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2 ; GFX11-GISEL-NEXT: s_cselect_b32 s2, 1, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_or_b32 s2, s5, s2 -; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-NEXT: s_or_b32 s8, s2, 0x1000 diff --git a/llvm/test/CodeGen/AMDGPU/fract.f64.ll b/llvm/test/CodeGen/AMDGPU/fract.f64.ll index f09c1c6..cc2e78d 100644 --- a/llvm/test/CodeGen/AMDGPU/fract.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fract.f64.ll @@ -2,8 +2,8 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -enable-unsafe-fp-math < %s | FileCheck --check-prefixes=GCN,SI,FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck --check-prefixes=GCN,SI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s declare double @llvm.fabs.f64(double) #0 declare double @llvm.floor.f64(double) #0 diff --git a/llvm/test/CodeGen/AMDGPU/fract.ll b/llvm/test/CodeGen/AMDGPU/fract.ll index 8ef0fcf..723fd93 100644 --- a/llvm/test/CodeGen/AMDGPU/fract.ll +++ b/llvm/test/CodeGen/AMDGPU/fract.ll @@ -1,8 +1,8 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck --check-prefix=GCN %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck --check-prefix=GCN %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefix=GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -enable-unsafe-fp-math < %s | FileCheck --check-prefix=GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck --check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck --check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefix=GCN %s declare float @llvm.fabs.f32(float) #0 declare float @llvm.floor.f32(float) #0 diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 6f91222..d8cbdb1 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -2048,7 +2048,7 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2 ; GFX1200-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX1200-FAKE16-NEXT: s_endpgm - ptr addrspace(1) %in2) #1 { + ptr addrspace(1) %in2) #0 { %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 %r0 = load half, ptr addrspace(1) %in1, align 4 %r1 = load half, ptr addrspace(1) %gep2, align 4 @@ -3417,7 +3417,7 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; GFX1200-NEXT: v_fmac_f32_e32 v1, v3, v2 ; GFX1200-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1200-NEXT: s_endpgm - ptr addrspace(1) %in2) #1 { + ptr addrspace(1) %in2) #0 { %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 %r0 = load float, ptr addrspace(1) %in1, align 4 %r1 = load float, ptr addrspace(1) %gep2, align 4 @@ -4821,7 +4821,7 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; GFX1200-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] ; GFX1200-NEXT: global_store_b64 v12, v[0:1], s[0:1] ; GFX1200-NEXT: s_endpgm - ptr addrspace(1) %in2) #1 { + ptr addrspace(1) %in2) #0 { %r0 = load double, ptr addrspace(1) %in1, align 8 %r1 = load double, ptr addrspace(1) %in2, align 8 %r2 = frem afn double %r0, %r1 @@ -18918,7 +18918,4 @@ define amdgpu_kernel void @frem_v2f64_const(ptr addrspace(1) %out) #0 { -attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } -attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } - - +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll index 1b74ddf..9b97981 100644 --- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll @@ -2870,7 +2870,7 @@ define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 { ret double %result } -define double @v_sqrt_f64__unsafe_attr(double %x) #4 { +define double @v_sqrt_f64__unsafe_attr(double %x) { ; GFX6-SDAG-LABEL: v_sqrt_f64__unsafe_attr: ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3449,7 +3449,6 @@ declare i32 @llvm.amdgcn.readfirstlane(i32) #1 attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } attributes #1 = { convergent nounwind willreturn memory(none) } attributes #3 = { "no-nans-fp-math"="true" "no-infs-fp-math"="true" } -attributes #4 = { "unsafe-fp-math"="true" } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GFX6: {{.*}} ; GFX8: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll index 9f19bcb..c93c077 100644 --- a/llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll @@ -239,4 +239,4 @@ declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) #0 declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) #0 attributes #0 = { nounwind readnone } -attributes #1 = { nounwind "unsafe-fp-math"="true" } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 37756d1..31f277f 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -472,7 +472,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -536,11 +535,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -606,7 +604,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -660,12 +657,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -710,9 +706,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -1690,7 +1685,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -1754,11 +1748,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1824,7 +1817,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -1878,12 +1870,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1928,9 +1919,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -2968,7 +2958,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -3032,11 +3021,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3102,7 +3090,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -3156,12 +3143,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3206,9 +3192,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -3742,7 +3727,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -3806,11 +3790,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3876,7 +3859,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -3930,12 +3912,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3980,9 +3961,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -5019,7 +4999,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -5083,11 +5062,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5153,7 +5131,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -5207,12 +5184,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5270,9 +5246,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -6284,7 +6259,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6354,7 +6328,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6424,7 +6397,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6485,8 +6457,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6550,7 +6520,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7717,7 +7686,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7787,7 +7755,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7857,7 +7824,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7918,8 +7884,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7983,7 +7947,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9150,7 +9113,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9220,7 +9182,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9290,7 +9251,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9351,8 +9311,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9416,7 +9374,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10065,7 +10022,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10135,7 +10091,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10205,7 +10160,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10266,8 +10220,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10331,7 +10283,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11498,7 +11449,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11568,7 +11518,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11638,7 +11587,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11699,8 +11647,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11764,7 +11710,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 6351bb3..4581efc 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -381,13 +381,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-NEXT: .LBB1_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s4, v0, s2 +; GFX9-NEXT: v_readlane_b32 s3, v0, s2 +; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v1 ; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -457,7 +456,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -513,7 +511,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 @@ -562,8 +559,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -610,11 +606,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -1420,13 +1414,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s4, v0, s2 +; GFX9-NEXT: v_readlane_b32 s3, v0, s2 +; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v1 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1496,7 +1489,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -1552,7 +1544,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 @@ -1601,8 +1592,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -1649,11 +1639,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -2459,13 +2447,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-NEXT: .LBB5_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s4, v0, s2 +; GFX9-NEXT: v_readlane_b32 s3, v0, s2 +; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v1 ; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2535,7 +2522,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -2591,7 +2577,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 @@ -2640,8 +2625,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -2688,11 +2672,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -3591,7 +3573,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -3665,7 +3646,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -3724,7 +3704,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -3774,8 +3753,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -3841,10 +3819,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -4859,7 +4836,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -4933,7 +4909,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -4992,7 +4967,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -5042,8 +5016,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -5109,10 +5082,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -6127,7 +6099,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -6201,7 +6172,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -6260,7 +6230,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -6310,8 +6279,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -6377,10 +6345,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index a9ac008..bd570d9 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -381,13 +381,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-NEXT: .LBB1_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s4, v0, s2 +; GFX9-NEXT: v_readlane_b32 s3, v0, s2 +; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v1 ; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -457,7 +456,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -513,7 +511,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 @@ -562,8 +559,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -610,11 +606,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -1420,13 +1414,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s4, v0, s2 +; GFX9-NEXT: v_readlane_b32 s3, v0, s2 +; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v1 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1496,7 +1489,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -1552,7 +1544,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 @@ -1601,8 +1592,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -1649,11 +1639,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -2459,13 +2447,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-NEXT: .LBB5_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s4, v0, s2 +; GFX9-NEXT: v_readlane_b32 s3, v0, s2 +; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v1 ; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2535,7 +2522,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -2591,7 +2577,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 @@ -2640,8 +2625,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -2688,11 +2672,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -3591,7 +3573,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -3665,7 +3646,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -3724,7 +3704,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -3774,8 +3753,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -3841,10 +3819,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -4859,7 +4836,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -4933,7 +4909,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -4992,7 +4967,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -5042,8 +5016,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -5109,10 +5082,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -6127,7 +6099,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -6201,7 +6172,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -6260,7 +6230,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -6310,8 +6279,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -6377,10 +6345,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index 6311143..1f2d70c 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -532,7 +532,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -596,11 +595,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -666,7 +664,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -720,12 +717,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -783,9 +779,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -1862,7 +1857,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -1926,11 +1920,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1996,7 +1989,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -2050,12 +2042,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2113,9 +2104,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -3192,7 +3182,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -3256,11 +3245,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3326,7 +3314,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -3380,12 +3367,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3443,9 +3429,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -4018,7 +4003,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -4082,11 +4066,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4152,7 +4135,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -4206,12 +4188,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4269,9 +4250,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -5347,7 +5327,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -5411,11 +5390,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5481,7 +5459,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -5535,12 +5512,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5598,9 +5574,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -6612,7 +6587,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6682,7 +6656,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6752,7 +6725,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6813,8 +6785,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6878,7 +6848,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8044,7 +8013,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8114,7 +8082,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8184,7 +8151,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8245,8 +8211,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8310,7 +8274,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9477,7 +9440,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9547,7 +9509,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9617,7 +9578,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9678,8 +9638,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9743,7 +9701,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10392,7 +10349,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10462,7 +10418,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10532,7 +10487,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10593,8 +10547,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10658,7 +10610,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11824,7 +11775,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11894,7 +11844,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11964,7 +11913,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12025,8 +11973,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12090,7 +12036,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/inline-attr.ll b/llvm/test/CodeGen/AMDGPU/inline-attr.ll index 4ae0ba0..c33b3344 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-attr.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-attr.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 -; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -O3 -enable-unsafe-fp-math %s | FileCheck --check-prefixes=GCN,UNSAFE %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -O3 %s | FileCheck --check-prefixes=GCN,UNSAFE %s ; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -O3 -enable-no-nans-fp-math %s | FileCheck --check-prefixes=GCN,NONANS %s ; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -O3 -enable-no-infs-fp-math %s | FileCheck --check-prefixes=GCN,NOINFS %s @@ -36,18 +36,18 @@ entry: ret void } -attributes #0 = { nounwind "uniform-work-group-size"="false" "unsafe-fp-math"="true"} -attributes #1 = { nounwind "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" } +attributes #0 = { nounwind "uniform-work-group-size"="false"} +attributes #1 = { nounwind "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" } ;. -; UNSAFE: attributes #[[ATTR0]] = { nounwind "uniform-work-group-size"="false" "unsafe-fp-math"="true" } -; UNSAFE: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } +; UNSAFE: attributes #[[ATTR0]] = { nounwind "uniform-work-group-size"="false" } +; UNSAFE: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "uniform-work-group-size"="false" } ;. -; NONANS: attributes #[[ATTR0]] = { nounwind "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } -; NONANS: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } +; NONANS: attributes #[[ATTR0]] = { nounwind "no-nans-fp-math"="true" "uniform-work-group-size"="false" } +; NONANS: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="true" "uniform-work-group-size"="false" } ;. -; NOINFS: attributes #[[ATTR0]] = { nounwind "no-infs-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } -; NOINFS: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } +; NOINFS: attributes #[[ATTR0]] = { nounwind "no-infs-fp-math"="true" "uniform-work-group-size"="false" } +; NOINFS: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="false" "uniform-work-group-size"="false" } ;. ; UNSAFE: [[META0]] = !{} ;. diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index eee232a..c3f3917 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -136,19 +136,17 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: .LBB2_6: ; %bb18 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: v_readfirstlane_b32 s13, v0 -; GFX11-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-NEXT: s_cselect_b32 s1, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 -; GFX11-NEXT: s_and_b32 s1, s8, s1 -; GFX11-NEXT: s_and_b32 s1, s1, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_cselect_b32 s13, -1, 0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13 +; GFX11-NEXT: s_and_b32 s13, s8, s13 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_and_b32 s13, s13, exec_lo ; GFX11-NEXT: v_readfirstlane_b32 s19, v2 -; GFX11-NEXT: s_cselect_b32 s1, s19, s13 -; GFX11-NEXT: s_and_b32 s13, 0xffff, s0 +; GFX11-NEXT: s_cselect_b32 s1, s19, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s1, s1, 1 -; GFX11-NEXT: s_cmp_lg_u32 s13, 0 +; GFX11-NEXT: s_and_b32 s13, 0xffff, s0 ; GFX11-NEXT: s_cselect_b32 s13, -1, 0 ; GFX11-NEXT: s_and_b32 s20, s9, exec_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.add.min.max.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.add.min.max.ll new file mode 100644 index 0000000..99421d4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.add.min.max.ll @@ -0,0 +1,191 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250-GISEL %s + +declare i32 @llvm.amdgcn.add.min.i32(i32, i32, i32, i1) +declare i32 @llvm.amdgcn.add.max.i32(i32, i32, i32, i1) +declare i32 @llvm.amdgcn.add.min.u32(i32, i32, i32, i1) +declare i32 @llvm.amdgcn.add.max.u32(i32, i32, i32, i1) +declare <2 x i16> @llvm.amdgcn.pk.add.min.i16(<2 x i16>, <2 x i16>, <2 x i16>, i1) +declare <2 x i16> @llvm.amdgcn.pk.add.max.i16(<2 x i16>, <2 x i16>, <2 x i16>, i1) +declare <2 x i16> @llvm.amdgcn.pk.add.min.u16(<2 x i16>, <2 x i16>, <2 x i16>, i1) +declare <2 x i16> @llvm.amdgcn.pk.add.max.u16(<2 x i16>, <2 x i16>, <2 x i16>, i1) + +define i32 @test_add_min_i32_vvv(i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: test_add_min_i32_vvv: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_add_min_i32 v0, v0, v1, v2 +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.add.min.i32(i32 %a, i32 %b, i32 %c, i1 0) + ret i32 %ret +} + +define i32 @test_add_min_i32_ssi_clamp(i32 inreg %a, i32 inreg %b) { +; GCN-LABEL: test_add_min_i32_ssi_clamp: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_add_min_i32 v0, s0, s1, 1 clamp +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.add.min.i32(i32 %a, i32 %b, i32 1, i1 1) + ret i32 %ret +} + +define i32 @test_add_min_u32_vvv(i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: test_add_min_u32_vvv: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_add_min_u32 v0, v0, v1, v2 +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.add.min.u32(i32 %a, i32 %b, i32 %c, i1 0) + ret i32 %ret +} + +define i32 @test_add_min_u32_ssi_clamp(i32 inreg %a, i32 inreg %b) { +; GCN-LABEL: test_add_min_u32_ssi_clamp: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_add_min_u32 v0, s0, s1, 1 clamp +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.add.min.u32(i32 %a, i32 %b, i32 1, i1 1) + ret i32 %ret +} + +define i32 @test_add_max_i32_vvv(i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: test_add_max_i32_vvv: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_add_max_i32 v0, v0, v1, v2 +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.add.max.i32(i32 %a, i32 %b, i32 %c, i1 0) + ret i32 %ret +} + +define i32 @test_add_max_i32_ssi_clamp(i32 inreg %a, i32 inreg %b) { +; GCN-LABEL: test_add_max_i32_ssi_clamp: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_add_max_i32 v0, s0, s1, 1 clamp +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.add.max.i32(i32 %a, i32 %b, i32 1, i1 1) + ret i32 %ret +} + +define i32 @test_add_max_u32_vvv(i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: test_add_max_u32_vvv: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_add_max_u32 v0, v0, v1, v2 +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.add.max.u32(i32 %a, i32 %b, i32 %c, i1 0) + ret i32 %ret +} + +define i32 @test_add_max_u32_ssi_clamp(i32 inreg %a, i32 inreg %b) { +; GCN-LABEL: test_add_max_u32_ssi_clamp: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_add_max_u32 v0, s0, s1, 1 clamp +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.add.max.u32(i32 %a, i32 %b, i32 1, i1 1) + ret i32 %ret +} + +define <2 x i16> @test_add_min_i16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) { +; GCN-LABEL: test_add_min_i16_vvv: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_pk_add_min_i16 v0, v0, v1, v2 +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.min.i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, i1 0) + ret <2 x i16> %ret +} + +define <2 x i16> @test_add_min_i16_ssi_clamp(<2 x i16> inreg %a, <2 x i16> inreg %b) { +; GCN-LABEL: test_add_min_i16_ssi_clamp: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_pk_add_min_i16 v0, s0, s1, 1 op_sel_hi:[1,1,0] clamp +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.min.i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> <i16 1, i16 1>, i1 1) + ret <2 x i16> %ret +} + +define <2 x i16> @test_add_min_u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) { +; GCN-LABEL: test_add_min_u16_vvv: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_pk_add_min_u16 v0, v0, v1, v2 +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.min.u16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, i1 0) + ret <2 x i16> %ret +} + +define <2 x i16> @test_add_min_u16_ssi_clamp(<2 x i16> inreg %a, <2 x i16> inreg %b) { +; GCN-LABEL: test_add_min_u16_ssi_clamp: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_pk_add_min_u16 v0, s0, s1, 1 op_sel_hi:[1,1,0] clamp +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.min.u16(<2 x i16> %a, <2 x i16> %b, <2 x i16> <i16 1, i16 1>, i1 1) + ret <2 x i16> %ret +} + +define <2 x i16> @test_add_max_i16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) { +; GCN-LABEL: test_add_max_i16_vvv: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_pk_add_max_i16 v0, v0, v1, v2 +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.max.i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, i1 0) + ret <2 x i16> %ret +} + +define <2 x i16> @test_add_max_i16_ssi_clamp(<2 x i16> inreg %a, <2 x i16> inreg %b) { +; GCN-LABEL: test_add_max_i16_ssi_clamp: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_pk_add_max_i16 v0, s0, s1, 1 op_sel_hi:[1,1,0] clamp +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.max.i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> <i16 1, i16 1>, i1 1) + ret <2 x i16> %ret +} + +define <2 x i16> @test_add_max_u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) { +; GCN-LABEL: test_add_max_u16_vvv: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_pk_add_max_u16 v0, v0, v1, v2 +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.max.u16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, i1 0) + ret <2 x i16> %ret +} + +define <2 x i16> @test_add_max_u16_ssi_clamp(<2 x i16> inreg %a, <2 x i16> inreg %b) { +; GCN-LABEL: test_add_max_u16_ssi_clamp: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_pk_add_max_u16 v0, s0, s1, 1 op_sel_hi:[1,1,0] clamp +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.max.u16(<2 x i16> %a, <2 x i16> %b, <2 x i16> <i16 1, i16 1>, i1 1) + ret <2 x i16> %ret +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX1250-GISEL: {{.*}} +; GFX1250-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index 883db20..e30a586 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -1485,7 +1485,7 @@ define float @v_exp2_f32_fast(float %in) { ret float %result } -define float @v_exp2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { +define float @v_exp2_f32_unsafe_math_attr(float %in) { ; SI-SDAG-LABEL: v_exp2_f32_unsafe_math_attr: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index 0854134..61a777f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -1907,7 +1907,7 @@ define float @v_log2_f32_fast(float %in) { ret float %result } -define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { +define float @v_log2_f32_unsafe_math_attr(float %in) { ; SI-SDAG-LABEL: v_log2_f32_unsafe_math_attr: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 8748aff..6dc9199 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -8265,12 +8265,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_readlane_b32 s6, v1, s3 -; GFX12-NEXT: s_lshl_b32 s7, 1, s3 ; GFX12-NEXT: v_writelane_b32 v0, s0, s3 +; GFX12-NEXT: s_lshl_b32 s3, 1, s3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 s1, s1, s7 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-NEXT: s_and_not1_b32 s1, s1, s3 ; GFX12-NEXT: s_add_f32 s0, s0, s6 ; GFX12-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX12-NEXT: ; %bb.6: ; %ComputeEnd @@ -8351,14 +8349,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX942-NEXT: .LBB28_5: ; %ComputeLoop ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX942-NEXT: v_readfirstlane_b32 s8, v1 -; GFX942-NEXT: v_readlane_b32 s9, v2, s3 +; GFX942-NEXT: v_readfirstlane_b32 s6, v1 ; GFX942-NEXT: s_mov_b32 m0, s3 +; GFX942-NEXT: v_readlane_b32 s8, v2, s3 +; GFX942-NEXT: v_writelane_b32 v0, s6, m0 +; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX942-NEXT: v_writelane_b32 v0, s8, m0 -; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX942-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX942-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX942-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX942-NEXT: ; %bb.6: ; %ComputeEnd ; GFX942-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8440,15 +8437,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: .LBB28_5: ; %ComputeLoop ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_ctz_i32_b32 s1, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readlane_b32 s6, v2, s1 -; GFX11-NEXT: s_lshl_b32 s7, 1, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 s0, s0, s7 ; GFX11-NEXT: v_writelane_b32 v0, s3, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_lshl_b32 s1, 1, s1 +; GFX11-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX11-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX11-NEXT: ; %bb.6: ; %ComputeEnd ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8528,11 +8524,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX10-NEXT: s_ff1_i32_b32 s1, s0 ; GFX10-NEXT: v_readfirstlane_b32 s3, v1 ; GFX10-NEXT: v_readlane_b32 s6, v2, s1 -; GFX10-NEXT: s_lshl_b32 s7, 1, s1 -; GFX10-NEXT: s_andn2_b32 s0, s0, s7 ; GFX10-NEXT: v_writelane_b32 v0, s3, s1 ; GFX10-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_lshl_b32 s1, 1, s1 +; GFX10-NEXT: s_andn2_b32 s0, s0, s1 ; GFX10-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX10-NEXT: ; %bb.6: ; %ComputeEnd ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8609,14 +8604,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX90A-NEXT: .LBB28_5: ; %ComputeLoop ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v1 -; GFX90A-NEXT: v_readlane_b32 s9, v2, s3 +; GFX90A-NEXT: v_readfirstlane_b32 s6, v1 ; GFX90A-NEXT: s_mov_b32 m0, s3 +; GFX90A-NEXT: v_readlane_b32 s8, v2, s3 +; GFX90A-NEXT: v_writelane_b32 v0, s6, m0 +; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX90A-NEXT: v_writelane_b32 v0, s8, m0 -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX90A-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX90A-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8692,14 +8686,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX908-NEXT: .LBB28_5: ; %ComputeLoop ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX908-NEXT: v_readfirstlane_b32 s8, v1 -; GFX908-NEXT: v_readlane_b32 s9, v2, s3 +; GFX908-NEXT: v_readfirstlane_b32 s6, v1 ; GFX908-NEXT: s_mov_b32 m0, s3 +; GFX908-NEXT: v_readlane_b32 s8, v2, s3 +; GFX908-NEXT: v_writelane_b32 v0, s6, m0 +; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX908-NEXT: v_writelane_b32 v0, s8, m0 -; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX908-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX908-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX908-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX908-NEXT: ; %bb.6: ; %ComputeEnd ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8776,14 +8769,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: .LBB28_5: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX8-NEXT: v_readfirstlane_b32 s8, v1 -; GFX8-NEXT: v_readlane_b32 s9, v2, s3 +; GFX8-NEXT: v_readfirstlane_b32 s6, v1 ; GFX8-NEXT: s_mov_b32 m0, s3 +; GFX8-NEXT: v_readlane_b32 s8, v2, s3 +; GFX8-NEXT: v_writelane_b32 v0, s6, m0 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: v_writelane_b32 v0, s8, m0 -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX8-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX8-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX8-NEXT: ; %bb.6: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9130,12 +9122,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_readlane_b32 s6, v1, s3 -; GFX12-NEXT: s_lshl_b32 s7, 1, s3 ; GFX12-NEXT: v_writelane_b32 v0, s0, s3 +; GFX12-NEXT: s_lshl_b32 s3, 1, s3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 s1, s1, s7 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-NEXT: s_and_not1_b32 s1, s1, s3 ; GFX12-NEXT: s_add_f32 s0, s0, s6 ; GFX12-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX12-NEXT: ; %bb.6: ; %ComputeEnd @@ -9212,14 +9202,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX942-NEXT: .LBB29_5: ; %ComputeLoop ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX942-NEXT: v_readfirstlane_b32 s8, v1 -; GFX942-NEXT: v_readlane_b32 s9, v2, s3 +; GFX942-NEXT: v_readfirstlane_b32 s6, v1 ; GFX942-NEXT: s_mov_b32 m0, s3 +; GFX942-NEXT: v_readlane_b32 s8, v2, s3 +; GFX942-NEXT: v_writelane_b32 v0, s6, m0 +; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX942-NEXT: v_writelane_b32 v0, s8, m0 -; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX942-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX942-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX942-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX942-NEXT: ; %bb.6: ; %ComputeEnd ; GFX942-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9296,15 +9285,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: .LBB29_5: ; %ComputeLoop ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_ctz_i32_b32 s1, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readlane_b32 s6, v2, s1 -; GFX11-NEXT: s_lshl_b32 s7, 1, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 s0, s0, s7 ; GFX11-NEXT: v_writelane_b32 v0, s3, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_lshl_b32 s1, 1, s1 +; GFX11-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX11-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX11-NEXT: ; %bb.6: ; %ComputeEnd ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9377,11 +9365,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: s_ff1_i32_b32 s1, s0 ; GFX10-NEXT: v_readfirstlane_b32 s3, v1 ; GFX10-NEXT: v_readlane_b32 s6, v2, s1 -; GFX10-NEXT: s_lshl_b32 s7, 1, s1 -; GFX10-NEXT: s_andn2_b32 s0, s0, s7 ; GFX10-NEXT: v_writelane_b32 v0, s3, s1 ; GFX10-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_lshl_b32 s1, 1, s1 +; GFX10-NEXT: s_andn2_b32 s0, s0, s1 ; GFX10-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX10-NEXT: ; %bb.6: ; %ComputeEnd ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9453,14 +9440,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: .LBB29_5: ; %ComputeLoop ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v1 -; GFX90A-NEXT: v_readlane_b32 s9, v2, s3 +; GFX90A-NEXT: v_readfirstlane_b32 s6, v1 ; GFX90A-NEXT: s_mov_b32 m0, s3 +; GFX90A-NEXT: v_readlane_b32 s8, v2, s3 +; GFX90A-NEXT: v_writelane_b32 v0, s6, m0 +; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX90A-NEXT: v_writelane_b32 v0, s8, m0 -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX90A-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX90A-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9533,14 +9519,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-NEXT: .LBB29_5: ; %ComputeLoop ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX908-NEXT: v_readfirstlane_b32 s8, v1 -; GFX908-NEXT: v_readlane_b32 s9, v2, s3 +; GFX908-NEXT: v_readfirstlane_b32 s6, v1 ; GFX908-NEXT: s_mov_b32 m0, s3 +; GFX908-NEXT: v_readlane_b32 s8, v2, s3 +; GFX908-NEXT: v_writelane_b32 v0, s6, m0 +; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX908-NEXT: v_writelane_b32 v0, s8, m0 -; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX908-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX908-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX908-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX908-NEXT: ; %bb.6: ; %ComputeEnd ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9614,14 +9599,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: .LBB29_5: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX8-NEXT: v_readfirstlane_b32 s8, v1 -; GFX8-NEXT: v_readlane_b32 s9, v2, s3 +; GFX8-NEXT: v_readfirstlane_b32 s6, v1 ; GFX8-NEXT: s_mov_b32 m0, s3 +; GFX8-NEXT: v_readlane_b32 s8, v2, s3 +; GFX8-NEXT: v_writelane_b32 v0, s6, m0 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: v_writelane_b32 v0, s8, m0 -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX8-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX8-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX8-NEXT: ; %bb.6: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll index d578d2e..60570bd 100644 --- a/llvm/test/CodeGen/AMDGPU/minmax.ll +++ b/llvm/test/CodeGen/AMDGPU/minmax.ll @@ -1296,4 +1296,4 @@ declare half @llvm.minnum.f16(half, half) declare half @llvm.maxnum.f16(half, half) declare float @llvm.minnum.f32(float, float) declare float @llvm.maxnum.f32(float, float) -attributes #0 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" } +attributes #0 = { nounwind "no-nans-fp-math"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir index c1cf06e..fba42c4 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir +++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir @@ -388,9 +388,8 @@ body: | ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY]], implicit-def $scc - ; GCN-NEXT: S_NOP 0, implicit killed $scc - ; GCN-NEXT: S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc + ; GCN-NEXT: S_BITCMP1_B32 killed [[COPY]], 0, implicit-def $scc + ; GCN-NEXT: S_NOP 0, implicit $scc ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} @@ -417,6 +416,80 @@ body: | S_ENDPGM 0 ... +--- +name: xor_1_cmp_lg_0_killed_scc +body: | + ; GCN-LABEL: name: xor_1_cmp_lg_0_killed_scc + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GCN-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 1, killed [[COPY]], implicit-def $scc + ; GCN-NEXT: S_NOP 0, implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $sgpr0, $vgpr0_vgpr1 + + %0:sreg_32 = COPY $sgpr0 + %1:sreg_32 = S_XOR_B32 1, killed %0, implicit-def $scc + S_NOP 0, implicit killed $scc + S_CMP_LG_U32 killed %1:sreg_32, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + bb.2: + S_ENDPGM 0 + +... +--- +name: absdiff_1_cmp_lg_0_killed_scc +body: | + ; GCN-LABEL: name: absdiff_1_cmp_lg_0_killed_scc + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GCN-NEXT: [[S_ABSDIFF_I32_:%[0-9]+]]:sreg_32 = S_ABSDIFF_I32 1, killed [[COPY]], implicit-def $scc + ; GCN-NEXT: S_NOP 0, implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $sgpr0, $vgpr0_vgpr1 + + %0:sreg_32 = COPY $sgpr0 + %1:sreg_32 = S_ABSDIFF_I32 1, killed %0, implicit-def $scc + S_NOP 0, implicit killed $scc + S_CMP_LG_U32 killed %1:sreg_32, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + bb.2: + S_ENDPGM 0 + +... --- name: and_1_cmp_eq_1_clobbered_scc @@ -2070,8 +2143,7 @@ body: | ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 3, killed [[COPY]], implicit-def dead $scc - ; GCN-NEXT: S_CMP_LG_U32 killed [[S_AND_B32_]], 0, implicit-def $scc + ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 3, killed [[COPY]], implicit-def $scc ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/prevent-fmul-hoist-ir.ll b/llvm/test/CodeGen/AMDGPU/prevent-fmul-hoist-ir.ll index ef3e04c..6ce614b 100644 --- a/llvm/test/CodeGen/AMDGPU/prevent-fmul-hoist-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/prevent-fmul-hoist-ir.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -passes='simplifycfg<hoist-common-insts>' -mtriple=amdgcn-- --fp-contract=fast -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX -check-prefix=FP-CONTRACT-FAST %s -; RUN: opt -S -passes='simplifycfg<hoist-common-insts>' -mtriple=amdgcn-- --fp-contract=off --enable-unsafe-fp-math -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX -check-prefix=UNSAFE-FP-MATH %s +; RUN: opt -S -passes='simplifycfg<hoist-common-insts>' -mtriple=amdgcn-- --fp-contract=off -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX -check-prefix=UNSAFE-FP-MATH %s ; RUN: opt -S -passes='simplifycfg<hoist-common-insts>' -mtriple=amdgcn-- --fp-contract=off -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX -check-prefix=NO-UNSAFE-FP-MATH %s define double @is_profitable_f64_contract(ptr dereferenceable(8) %ptr_x, ptr dereferenceable(8) %ptr_y, ptr dereferenceable(8) %ptr_a) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll index f53aaaa..dd5f838 100644 --- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll +++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s declare i32 @llvm.ctpop.i32(i32) declare i64 @llvm.ctpop.i64(i64) @@ -10,7 +10,6 @@ define amdgpu_ps i32 @shl32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: shl32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_lshl_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -25,7 +24,6 @@ define amdgpu_ps i32 @shl64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: shl64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -40,7 +38,6 @@ define amdgpu_ps i32 @lshr32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: lshr32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_lshr_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -55,7 +52,6 @@ define amdgpu_ps i32 @lshr64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: lshr64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -70,7 +66,6 @@ define amdgpu_ps i32 @ashr32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: ashr32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_ashr_i32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -85,7 +80,6 @@ define amdgpu_ps i32 @ashr64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: ashr64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_ashr_i64 s[0:1], s[0:1], s2 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -100,7 +94,6 @@ define amdgpu_ps i32 @abs32(i32 inreg %val0) { ; CHECK-LABEL: abs32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_abs_i32 s0, s0 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -121,7 +114,6 @@ define amdgpu_ps i32 @and32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: and32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_and_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -136,7 +128,6 @@ define amdgpu_ps i32 @and64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: and64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -151,7 +142,6 @@ define amdgpu_ps i32 @or32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: or32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_or_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -166,7 +156,6 @@ define amdgpu_ps i32 @or64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: or64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -181,7 +170,6 @@ define amdgpu_ps i32 @xor32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: xor32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_xor_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -196,7 +184,6 @@ define amdgpu_ps i32 @xor64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: xor64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -211,7 +198,6 @@ define amdgpu_ps i32 @nand32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: nand32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_nand_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -231,7 +217,6 @@ define amdgpu_ps i32 @nand64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: nand64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -251,7 +236,6 @@ define amdgpu_ps i32 @nor32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: nor32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_nor_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -271,7 +255,6 @@ define amdgpu_ps i32 @nor64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: nor64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -291,7 +274,6 @@ define amdgpu_ps i32 @xnor32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: xnor32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_xnor_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -311,7 +293,6 @@ define amdgpu_ps i32 @xnor64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: xnor64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_xnor_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -331,7 +312,6 @@ define amdgpu_ps i32 @andn232(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: andn232: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_andn2_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -347,7 +327,6 @@ define amdgpu_ps i32 @nandn264(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: nandn264: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -363,7 +342,6 @@ define amdgpu_ps i32 @orn232(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: orn232: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_orn2_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -379,7 +357,6 @@ define amdgpu_ps i32 @orn264(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: orn264: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_orn2_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -395,7 +372,6 @@ define amdgpu_ps i32 @bfe_i32(i32 inreg %val0) { ; CHECK-LABEL: bfe_i32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_bfe_i32 s0, s0, 0x80010 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -433,7 +409,6 @@ define amdgpu_ps i32 @bfe_u32(i32 inreg %val0) { ; CHECK-LABEL: bfe_u32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_bfe_u32 s0, s0, 0x80010 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -513,7 +488,6 @@ define amdgpu_ps i32 @bcnt132(i32 inreg %val0) { ; CHECK-LABEL: bcnt132: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -552,7 +526,6 @@ define amdgpu_ps i32 @quadmask32(i32 inreg %val0) { ; CHECK-LABEL: quadmask32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_quadmask_b32 s0, s0 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -571,7 +544,6 @@ define amdgpu_ps i32 @quadmask64(i64 inreg %val0) { ; CHECK-LABEL: quadmask64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_quadmask_b64 s[0:1], s[0:1] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -590,7 +562,6 @@ define amdgpu_ps i32 @not32(i32 inreg %val0) { ; CHECK-LABEL: not32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_not_b32 s0, s0 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -609,7 +580,6 @@ define amdgpu_ps i32 @not64(i64 inreg %val0) { ; CHECK-LABEL: not64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_not_b64 s[0:1], s[0:1] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -623,3 +593,35 @@ define amdgpu_ps i32 @not64(i64 inreg %val0) { %zext = zext i1 %cmp to i32 ret i32 %zext } + + +; -------------------------------------------------------------------------------- +; Negative tests +; -------------------------------------------------------------------------------- + +@1 = extern_weak dso_local addrspace(4) constant i32 + +define amdgpu_ps i32 @si_pc_add_rel_offset_must_not_optimize() { +; CHECK-LABEL: si_pc_add_rel_offset_must_not_optimize: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_getpc_b64 s[0:1] +; CHECK-NEXT: s_add_u32 s0, s0, __unnamed_1@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s1, s1, __unnamed_1@rel32@hi+12 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB35_2 +; CHECK-NEXT: ; %bb.1: ; %endif +; CHECK-NEXT: s_mov_b32 s0, 1 +; CHECK-NEXT: s_branch .LBB35_3 +; CHECK-NEXT: .LBB35_2: ; %if +; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: s_branch .LBB35_3 +; CHECK-NEXT: .LBB35_3: + %cmp = icmp ne ptr addrspace(4) @1, null + br i1 %cmp, label %endif, label %if + +if: + ret i32 0 + +endif: + ret i32 1 +} diff --git a/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll b/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll index a828ee0..7552f6b 100644 --- a/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll +++ b/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll @@ -12,8 +12,6 @@ define amdgpu_ps i32 @s_uaddo_pseudo(i32 inreg %val0) { ; CHECK-LABEL: s_uaddo_pseudo: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 s0, s0, 1 -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_addc_u32 s0, 1, 0 ; CHECK-NEXT: ; return to shader part epilog %pair = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %val0, i32 1) @@ -32,8 +30,6 @@ define amdgpu_ps i32 @s_usubo_pseudo(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: s_usubo_pseudo: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_sub_u32 s0, s0, 1 -; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 ; CHECK-NEXT: s_subb_u32 s0, s1, 0 ; CHECK-NEXT: ; return to shader part epilog %pair = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %val0, i32 1) diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 5f6d622..71f5a94 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -56,10 +56,9 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_addc_u32 s15, 0, s16 ; GCN-NEXT: s_add_u32 s16, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s12, v0 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s14, s14, s15 ; GCN-NEXT: s_mul_i32 s0, s12, s14 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 @@ -90,7 +89,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_add_u32 s15, s16, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s14, s14, s12 ; GCN-NEXT: s_ashr_i32 s12, s7, 31 ; GCN-NEXT: s_add_u32 s0, s6, s12 @@ -116,52 +114,50 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-NEXT: s_addc_u32 s4, s4, 0 ; GCN-NEXT: s_mul_i32 s14, s7, s14 -; GCN-NEXT: s_add_u32 s14, s1, s14 -; GCN-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NEXT: s_add_u32 s16, s1, s14 +; GCN-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 -; GCN-NEXT: s_addc_u32 s15, 0, s4 +; GCN-NEXT: s_addc_u32 s17, 0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_mul_i32 s4, s10, s15 +; GCN-NEXT: s_mul_i32 s4, s10, s17 ; GCN-NEXT: v_readfirstlane_b32 s5, v0 ; GCN-NEXT: s_add_i32 s4, s5, s4 -; GCN-NEXT: s_mul_i32 s5, s11, s14 -; GCN-NEXT: s_add_i32 s16, s4, s5 -; GCN-NEXT: s_sub_i32 s17, s7, s16 -; GCN-NEXT: s_mul_i32 s4, s10, s14 +; GCN-NEXT: s_mul_i32 s5, s11, s16 +; GCN-NEXT: s_add_i32 s18, s4, s5 +; GCN-NEXT: s_sub_i32 s14, s7, s18 +; GCN-NEXT: s_mul_i32 s4, s10, s16 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s18, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s18, 0 -; GCN-NEXT: s_subb_u32 s17, s17, s11 -; GCN-NEXT: s_sub_u32 s19, s6, s10 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_or_b32 s15, s4, s5 +; GCN-NEXT: s_subb_u32 s19, s14, s11 +; GCN-NEXT: s_sub_u32 s20, s6, s10 +; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GCN-NEXT: s_or_b32 s14, s14, s15 +; GCN-NEXT: s_subb_u32 s14, s19, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s11 +; GCN-NEXT: s_cselect_b32 s15, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s20, s10 +; GCN-NEXT: s_cselect_b32 s19, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s14, s11 +; GCN-NEXT: s_cselect_b32 s14, s19, s15 +; GCN-NEXT: s_add_u32 s15, s16, 1 +; GCN-NEXT: s_addc_u32 s19, s17, 0 +; GCN-NEXT: s_add_u32 s20, s16, 2 +; GCN-NEXT: s_addc_u32 s21, s17, 0 +; GCN-NEXT: s_cmp_lg_u32 s14, 0 +; GCN-NEXT: s_cselect_b32 s14, s20, s15 +; GCN-NEXT: s_cselect_b32 s15, s21, s19 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_subb_u32 s4, s17, 0 +; GCN-NEXT: s_subb_u32 s4, s7, s18 ; GCN-NEXT: s_cmp_ge_u32 s4, s11 ; GCN-NEXT: s_cselect_b32 s5, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s19, s10 -; GCN-NEXT: s_cselect_b32 s17, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s4, s11 -; GCN-NEXT: s_cselect_b32 s4, s17, s5 -; GCN-NEXT: s_add_u32 s5, s14, 1 -; GCN-NEXT: s_addc_u32 s17, s15, 0 -; GCN-NEXT: s_add_u32 s19, s14, 2 -; GCN-NEXT: s_addc_u32 s20, s15, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_cselect_b32 s4, s19, s5 -; GCN-NEXT: s_cselect_b32 s5, s20, s17 -; GCN-NEXT: s_cmp_lg_u32 s18, 0 -; GCN-NEXT: s_subb_u32 s7, s7, s16 -; GCN-NEXT: s_cmp_ge_u32 s7, s11 -; GCN-NEXT: s_cselect_b32 s16, -1, 0 ; GCN-NEXT: s_cmp_ge_u32 s6, s10 ; GCN-NEXT: s_cselect_b32 s6, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s7, s11 -; GCN-NEXT: s_cselect_b32 s6, s6, s16 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_cselect_b32 s5, s5, s15 -; GCN-NEXT: s_cselect_b32 s4, s4, s14 +; GCN-NEXT: s_cmp_eq_u32 s4, s11 +; GCN-NEXT: s_cselect_b32 s4, s6, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_cselect_b32 s5, s15, s17 +; GCN-NEXT: s_cselect_b32 s4, s14, s16 ; GCN-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9] ; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GCN-NEXT: s_sub_u32 s4, s4, s6 @@ -208,7 +204,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_add_u32 s18, s16, 1 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_or_b32 s10, s10, s11 -; GCN-IR-NEXT: s_cmp_lg_u32 s10, 0 ; GCN-IR-NEXT: s_addc_u32 s10, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s16, 63, s16 @@ -242,7 +237,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_or_b32 s20, s20, s21 -; GCN-IR-NEXT: s_cmp_lg_u32 s20, 0 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[12:13], s[8:9] @@ -1195,10 +1189,9 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_addc_u32 s12, 0, s13 ; GCN-NEXT: s_add_u32 s13, s8, s9 ; GCN-NEXT: v_mov_b32_e32 v0, s13 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: s_addc_u32 s11, s11, s12 ; GCN-NEXT: s_mul_i32 s8, s2, s11 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 @@ -1229,7 +1222,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_add_u32 s2, s13, s2 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: s_addc_u32 s8, s11, s10 ; GCN-NEXT: v_mul_hi_u32 v1, s2, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s8, 24 @@ -1238,48 +1230,46 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_readfirstlane_b32 s10, v1 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 ; GCN-NEXT: s_add_u32 s8, s10, s8 -; GCN-NEXT: s_addc_u32 s10, 0, s9 -; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: s_addc_u32 s12, 0, s9 +; GCN-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 -; GCN-NEXT: s_mul_i32 s8, s7, s10 +; GCN-NEXT: s_mul_i32 s8, s7, s12 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 -; GCN-NEXT: s_add_i32 s11, s9, s8 -; GCN-NEXT: s_sub_i32 s12, 0, s11 -; GCN-NEXT: s_mul_i32 s8, s6, s10 -; GCN-NEXT: s_sub_u32 s13, 24, s8 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s14, s8, s9 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_subb_u32 s12, s12, s7 -; GCN-NEXT: s_sub_u32 s15, s13, s6 +; GCN-NEXT: s_add_i32 s13, s9, s8 +; GCN-NEXT: s_sub_i32 s10, 0, s13 +; GCN-NEXT: s_mul_i32 s8, s6, s12 +; GCN-NEXT: s_sub_u32 s14, 24, s8 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s11, s8, s9 +; GCN-NEXT: s_subb_u32 s15, s10, s7 +; GCN-NEXT: s_sub_u32 s16, s14, s6 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s10, s10, s11 +; GCN-NEXT: s_subb_u32 s10, s15, 0 +; GCN-NEXT: s_cmp_ge_u32 s10, s7 +; GCN-NEXT: s_cselect_b32 s11, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s16, s6 +; GCN-NEXT: s_cselect_b32 s15, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s10, s7 +; GCN-NEXT: s_cselect_b32 s10, s15, s11 +; GCN-NEXT: s_add_u32 s11, s12, 1 +; GCN-NEXT: s_addc_u32 s15, 0, 0 +; GCN-NEXT: s_add_u32 s16, s12, 2 +; GCN-NEXT: s_addc_u32 s17, 0, 0 +; GCN-NEXT: s_cmp_lg_u32 s10, 0 +; GCN-NEXT: s_cselect_b32 s10, s16, s11 +; GCN-NEXT: s_cselect_b32 s11, s17, s15 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_subb_u32 s8, s12, 0 +; GCN-NEXT: s_subb_u32 s8, 0, s13 ; GCN-NEXT: s_cmp_ge_u32 s8, s7 ; GCN-NEXT: s_cselect_b32 s9, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s15, s6 -; GCN-NEXT: s_cselect_b32 s12, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s8, s7 -; GCN-NEXT: s_cselect_b32 s8, s12, s9 -; GCN-NEXT: s_add_u32 s9, s10, 1 -; GCN-NEXT: s_addc_u32 s12, 0, 0 -; GCN-NEXT: s_add_u32 s15, s10, 2 -; GCN-NEXT: s_addc_u32 s16, 0, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_cselect_b32 s8, s15, s9 -; GCN-NEXT: s_cselect_b32 s9, s16, s12 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_subb_u32 s11, 0, s11 -; GCN-NEXT: s_cmp_ge_u32 s11, s7 -; GCN-NEXT: s_cselect_b32 s12, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s13, s6 +; GCN-NEXT: s_cmp_ge_u32 s14, s6 ; GCN-NEXT: s_cselect_b32 s6, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s11, s7 -; GCN-NEXT: s_cselect_b32 s6, s6, s12 +; GCN-NEXT: s_cmp_eq_u32 s8, s7 +; GCN-NEXT: s_cselect_b32 s6, s6, s9 ; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_cselect_b32 s7, s9, 0 -; GCN-NEXT: s_cselect_b32 s6, s8, s10 +; GCN-NEXT: s_cselect_b32 s7, s11, 0 +; GCN-NEXT: s_cselect_b32 s6, s10, s12 ; GCN-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_subb_u32 s7, s7, s4 @@ -1315,7 +1305,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s12, s10, 1 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_or_b32 s8, s8, s9 -; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-IR-NEXT: s_addc_u32 s8, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s10, 63, s10 @@ -1348,7 +1337,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_or_b32 s18, s18, s19 -; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll index 09596e9..7ddd90e 100644 --- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s define amdgpu_kernel void @sitofp_i16_to_f16( ; SI-LABEL: sitofp_i16_to_f16: diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index bbd1793..e12e31b 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -1513,7 +1513,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GCN-NEXT: s_sub_u32 s3, 0, s8 -; GCN-NEXT: s_subb_u32 s12, 0, s9 +; GCN-NEXT: s_subb_u32 s10, 0, s9 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1522,56 +1522,52 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s13, v1 -; GCN-NEXT: v_readfirstlane_b32 s10, v0 -; GCN-NEXT: s_mul_i32 s11, s3, s13 -; GCN-NEXT: s_mul_hi_u32 s15, s3, s10 -; GCN-NEXT: s_mul_i32 s14, s12, s10 -; GCN-NEXT: s_add_i32 s11, s15, s11 -; GCN-NEXT: s_add_i32 s11, s11, s14 -; GCN-NEXT: s_mul_i32 s16, s3, s10 -; GCN-NEXT: s_mul_i32 s15, s10, s11 -; GCN-NEXT: s_mul_hi_u32 s17, s10, s16 -; GCN-NEXT: s_mul_hi_u32 s14, s10, s11 +; GCN-NEXT: v_readfirstlane_b32 s11, v1 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_mul_i32 s13, s3, s11 +; GCN-NEXT: s_mul_hi_u32 s15, s3, s12 +; GCN-NEXT: s_mul_i32 s14, s10, s12 +; GCN-NEXT: s_add_i32 s13, s15, s13 +; GCN-NEXT: s_add_i32 s13, s13, s14 +; GCN-NEXT: s_mul_i32 s16, s3, s12 +; GCN-NEXT: s_mul_i32 s15, s12, s13 +; GCN-NEXT: s_mul_hi_u32 s17, s12, s16 +; GCN-NEXT: s_mul_hi_u32 s14, s12, s13 ; GCN-NEXT: s_add_u32 s15, s17, s15 ; GCN-NEXT: s_addc_u32 s14, 0, s14 -; GCN-NEXT: s_mul_hi_u32 s18, s13, s16 -; GCN-NEXT: s_mul_i32 s16, s13, s16 +; GCN-NEXT: s_mul_hi_u32 s18, s11, s16 +; GCN-NEXT: s_mul_i32 s16, s11, s16 ; GCN-NEXT: s_add_u32 s15, s15, s16 -; GCN-NEXT: s_mul_hi_u32 s17, s13, s11 +; GCN-NEXT: s_mul_hi_u32 s17, s11, s13 ; GCN-NEXT: s_addc_u32 s14, s14, s18 ; GCN-NEXT: s_addc_u32 s15, s17, 0 -; GCN-NEXT: s_mul_i32 s11, s13, s11 -; GCN-NEXT: s_add_u32 s11, s14, s11 +; GCN-NEXT: s_mul_i32 s13, s11, s13 +; GCN-NEXT: s_add_u32 s13, s14, s13 ; GCN-NEXT: s_addc_u32 s14, 0, s15 -; GCN-NEXT: s_add_u32 s15, s10, s11 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GCN-NEXT: s_addc_u32 s13, s13, s14 -; GCN-NEXT: s_mul_i32 s10, s3, s13 -; GCN-NEXT: s_mul_hi_u32 s11, s3, s15 -; GCN-NEXT: s_add_i32 s10, s11, s10 -; GCN-NEXT: s_mul_i32 s12, s12, s15 -; GCN-NEXT: s_add_i32 s10, s10, s12 -; GCN-NEXT: s_mul_i32 s3, s3, s15 -; GCN-NEXT: s_mul_hi_u32 s12, s13, s3 -; GCN-NEXT: s_mul_i32 s14, s13, s3 -; GCN-NEXT: s_mul_i32 s17, s15, s10 -; GCN-NEXT: s_mul_hi_u32 s3, s15, s3 -; GCN-NEXT: s_mul_hi_u32 s16, s15, s10 +; GCN-NEXT: s_add_u32 s12, s12, s13 +; GCN-NEXT: s_addc_u32 s11, s11, s14 +; GCN-NEXT: s_mul_i32 s13, s3, s11 +; GCN-NEXT: s_mul_hi_u32 s14, s3, s12 +; GCN-NEXT: s_add_i32 s13, s14, s13 +; GCN-NEXT: s_mul_i32 s10, s10, s12 +; GCN-NEXT: s_add_i32 s13, s13, s10 +; GCN-NEXT: s_mul_i32 s3, s3, s12 +; GCN-NEXT: s_mul_hi_u32 s14, s11, s3 +; GCN-NEXT: s_mul_i32 s15, s11, s3 +; GCN-NEXT: s_mul_i32 s17, s12, s13 +; GCN-NEXT: s_mul_hi_u32 s3, s12, s3 +; GCN-NEXT: s_mul_hi_u32 s16, s12, s13 ; GCN-NEXT: s_add_u32 s3, s3, s17 ; GCN-NEXT: s_addc_u32 s16, 0, s16 -; GCN-NEXT: s_add_u32 s3, s3, s14 -; GCN-NEXT: s_mul_hi_u32 s11, s13, s10 -; GCN-NEXT: s_addc_u32 s3, s16, s12 -; GCN-NEXT: s_addc_u32 s11, s11, 0 -; GCN-NEXT: s_mul_i32 s10, s13, s10 -; GCN-NEXT: s_add_u32 s3, s3, s10 -; GCN-NEXT: s_addc_u32 s12, 0, s11 -; GCN-NEXT: s_add_u32 s3, s15, s3 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GCN-NEXT: s_addc_u32 s14, s13, s12 +; GCN-NEXT: s_add_u32 s3, s3, s15 +; GCN-NEXT: s_mul_hi_u32 s10, s11, s13 +; GCN-NEXT: s_addc_u32 s3, s16, s14 +; GCN-NEXT: s_addc_u32 s10, s10, 0 +; GCN-NEXT: s_mul_i32 s13, s11, s13 +; GCN-NEXT: s_add_u32 s3, s3, s13 +; GCN-NEXT: s_addc_u32 s10, 0, s10 +; GCN-NEXT: s_add_u32 s3, s12, s3 +; GCN-NEXT: s_addc_u32 s14, s11, s10 ; GCN-NEXT: s_ashr_i32 s10, s5, 31 ; GCN-NEXT: s_add_u32 s12, s4, s10 ; GCN-NEXT: s_mov_b32 s11, s10 @@ -1600,11 +1596,9 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: s_mul_i32 s3, s8, s3 ; GCN-NEXT: s_sub_u32 s3, s12, s3 ; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 ; GCN-NEXT: s_subb_u32 s12, s16, s9 ; GCN-NEXT: s_sub_u32 s18, s3, s8 ; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s19, s12, 0 ; GCN-NEXT: s_cmp_ge_u32 s19, s9 ; GCN-NEXT: s_cselect_b32 s20, -1, 0 @@ -1614,12 +1608,10 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: s_cselect_b32 s20, s21, s20 ; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s12, s12, s9 -; GCN-NEXT: s_sub_u32 s21, s18, s8 -; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GCN-NEXT: s_sub_u32 s16, s18, s8 ; GCN-NEXT: s_subb_u32 s12, s12, 0 ; GCN-NEXT: s_cmp_lg_u32 s20, 0 -; GCN-NEXT: s_cselect_b32 s16, s21, s18 +; GCN-NEXT: s_cselect_b32 s16, s16, s18 ; GCN-NEXT: s_cselect_b32 s12, s12, s19 ; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 ; GCN-NEXT: s_subb_u32 s5, s13, s5 @@ -1931,11 +1923,9 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TONGA-NEXT: v_readfirstlane_b32 s14, v0 ; TONGA-NEXT: s_sub_u32 s12, s12, s14 ; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s3, s3, s7 ; TONGA-NEXT: s_sub_u32 s18, s12, s6 ; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s19, s3, 0 ; TONGA-NEXT: s_cmp_ge_u32 s19, s7 ; TONGA-NEXT: s_cselect_b32 s20, -1, 0 @@ -1945,12 +1935,10 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TONGA-NEXT: s_cselect_b32 s20, s21, s20 ; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s3, s3, s7 -; TONGA-NEXT: s_sub_u32 s21, s18, s6 -; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 +; TONGA-NEXT: s_sub_u32 s16, s18, s6 ; TONGA-NEXT: s_subb_u32 s3, s3, 0 ; TONGA-NEXT: s_cmp_lg_u32 s20, 0 -; TONGA-NEXT: s_cselect_b32 s16, s21, s18 +; TONGA-NEXT: s_cselect_b32 s16, s16, s18 ; TONGA-NEXT: s_cselect_b32 s3, s3, s19 ; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s5, s13, s5 @@ -2730,7 +2718,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GCN-NEXT: s_sub_u32 s9, 0, s6 -; GCN-NEXT: s_subb_u32 s16, 0, s7 +; GCN-NEXT: s_subb_u32 s14, 0, s7 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2739,56 +2727,52 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s17, v1 -; GCN-NEXT: v_readfirstlane_b32 s14, v0 -; GCN-NEXT: s_mul_i32 s15, s9, s17 -; GCN-NEXT: s_mul_hi_u32 s19, s9, s14 -; GCN-NEXT: s_mul_i32 s18, s16, s14 -; GCN-NEXT: s_add_i32 s15, s19, s15 -; GCN-NEXT: s_add_i32 s15, s15, s18 -; GCN-NEXT: s_mul_i32 s20, s9, s14 -; GCN-NEXT: s_mul_i32 s19, s14, s15 -; GCN-NEXT: s_mul_hi_u32 s21, s14, s20 -; GCN-NEXT: s_mul_hi_u32 s18, s14, s15 +; GCN-NEXT: v_readfirstlane_b32 s15, v1 +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; GCN-NEXT: s_mul_i32 s17, s9, s15 +; GCN-NEXT: s_mul_hi_u32 s19, s9, s16 +; GCN-NEXT: s_mul_i32 s18, s14, s16 +; GCN-NEXT: s_add_i32 s17, s19, s17 +; GCN-NEXT: s_add_i32 s17, s17, s18 +; GCN-NEXT: s_mul_i32 s20, s9, s16 +; GCN-NEXT: s_mul_i32 s19, s16, s17 +; GCN-NEXT: s_mul_hi_u32 s21, s16, s20 +; GCN-NEXT: s_mul_hi_u32 s18, s16, s17 ; GCN-NEXT: s_add_u32 s19, s21, s19 ; GCN-NEXT: s_addc_u32 s18, 0, s18 -; GCN-NEXT: s_mul_hi_u32 s22, s17, s20 -; GCN-NEXT: s_mul_i32 s20, s17, s20 +; GCN-NEXT: s_mul_hi_u32 s22, s15, s20 +; GCN-NEXT: s_mul_i32 s20, s15, s20 ; GCN-NEXT: s_add_u32 s19, s19, s20 -; GCN-NEXT: s_mul_hi_u32 s21, s17, s15 +; GCN-NEXT: s_mul_hi_u32 s21, s15, s17 ; GCN-NEXT: s_addc_u32 s18, s18, s22 ; GCN-NEXT: s_addc_u32 s19, s21, 0 -; GCN-NEXT: s_mul_i32 s15, s17, s15 -; GCN-NEXT: s_add_u32 s15, s18, s15 +; GCN-NEXT: s_mul_i32 s17, s15, s17 +; GCN-NEXT: s_add_u32 s17, s18, s17 ; GCN-NEXT: s_addc_u32 s18, 0, s19 -; GCN-NEXT: s_add_u32 s19, s14, s15 -; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GCN-NEXT: s_addc_u32 s17, s17, s18 -; GCN-NEXT: s_mul_i32 s14, s9, s17 -; GCN-NEXT: s_mul_hi_u32 s15, s9, s19 -; GCN-NEXT: s_add_i32 s14, s15, s14 -; GCN-NEXT: s_mul_i32 s16, s16, s19 -; GCN-NEXT: s_add_i32 s14, s14, s16 -; GCN-NEXT: s_mul_i32 s9, s9, s19 -; GCN-NEXT: s_mul_hi_u32 s16, s17, s9 -; GCN-NEXT: s_mul_i32 s18, s17, s9 -; GCN-NEXT: s_mul_i32 s21, s19, s14 -; GCN-NEXT: s_mul_hi_u32 s9, s19, s9 -; GCN-NEXT: s_mul_hi_u32 s20, s19, s14 +; GCN-NEXT: s_add_u32 s16, s16, s17 +; GCN-NEXT: s_addc_u32 s15, s15, s18 +; GCN-NEXT: s_mul_i32 s17, s9, s15 +; GCN-NEXT: s_mul_hi_u32 s18, s9, s16 +; GCN-NEXT: s_add_i32 s17, s18, s17 +; GCN-NEXT: s_mul_i32 s14, s14, s16 +; GCN-NEXT: s_add_i32 s17, s17, s14 +; GCN-NEXT: s_mul_i32 s9, s9, s16 +; GCN-NEXT: s_mul_hi_u32 s18, s15, s9 +; GCN-NEXT: s_mul_i32 s19, s15, s9 +; GCN-NEXT: s_mul_i32 s21, s16, s17 +; GCN-NEXT: s_mul_hi_u32 s9, s16, s9 +; GCN-NEXT: s_mul_hi_u32 s20, s16, s17 ; GCN-NEXT: s_add_u32 s9, s9, s21 ; GCN-NEXT: s_addc_u32 s20, 0, s20 -; GCN-NEXT: s_add_u32 s9, s9, s18 -; GCN-NEXT: s_mul_hi_u32 s15, s17, s14 -; GCN-NEXT: s_addc_u32 s9, s20, s16 -; GCN-NEXT: s_addc_u32 s15, s15, 0 -; GCN-NEXT: s_mul_i32 s14, s17, s14 -; GCN-NEXT: s_add_u32 s9, s9, s14 -; GCN-NEXT: s_addc_u32 s16, 0, s15 -; GCN-NEXT: s_add_u32 s9, s19, s9 -; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GCN-NEXT: s_addc_u32 s18, s17, s16 +; GCN-NEXT: s_add_u32 s9, s9, s19 +; GCN-NEXT: s_mul_hi_u32 s14, s15, s17 +; GCN-NEXT: s_addc_u32 s9, s20, s18 +; GCN-NEXT: s_addc_u32 s14, s14, 0 +; GCN-NEXT: s_mul_i32 s17, s15, s17 +; GCN-NEXT: s_add_u32 s9, s9, s17 +; GCN-NEXT: s_addc_u32 s14, 0, s14 +; GCN-NEXT: s_add_u32 s9, s16, s9 +; GCN-NEXT: s_addc_u32 s18, s15, s14 ; GCN-NEXT: s_ashr_i32 s14, s11, 31 ; GCN-NEXT: s_add_u32 s16, s10, s14 ; GCN-NEXT: s_mov_b32 s15, s14 @@ -2817,11 +2801,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s9, s6, s9 ; GCN-NEXT: s_sub_u32 s9, s16, s9 ; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s16, s20, s7 ; GCN-NEXT: s_sub_u32 s22, s9, s6 ; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s23, s16, 0 ; GCN-NEXT: s_cmp_ge_u32 s23, s7 ; GCN-NEXT: s_cselect_b32 s24, -1, 0 @@ -2831,12 +2813,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s24, s25, s24 ; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s16, s16, s7 -; GCN-NEXT: s_sub_u32 s25, s22, s6 -; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 +; GCN-NEXT: s_sub_u32 s20, s22, s6 ; GCN-NEXT: s_subb_u32 s16, s16, 0 ; GCN-NEXT: s_cmp_lg_u32 s24, 0 -; GCN-NEXT: s_cselect_b32 s20, s25, s22 +; GCN-NEXT: s_cselect_b32 s20, s20, s22 ; GCN-NEXT: s_cselect_b32 s16, s16, s23 ; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s11, s17, s11 @@ -2887,7 +2867,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GCN-NEXT: s_sub_u32 s3, 0, s10 -; GCN-NEXT: s_subb_u32 s14, 0, s11 +; GCN-NEXT: s_subb_u32 s12, 0, s11 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2896,56 +2876,52 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s15, v1 -; GCN-NEXT: v_readfirstlane_b32 s12, v0 -; GCN-NEXT: s_mul_i32 s13, s3, s15 -; GCN-NEXT: s_mul_hi_u32 s17, s3, s12 -; GCN-NEXT: s_mul_i32 s16, s14, s12 -; GCN-NEXT: s_add_i32 s13, s17, s13 -; GCN-NEXT: s_add_i32 s13, s13, s16 -; GCN-NEXT: s_mul_i32 s18, s3, s12 -; GCN-NEXT: s_mul_i32 s17, s12, s13 -; GCN-NEXT: s_mul_hi_u32 s19, s12, s18 -; GCN-NEXT: s_mul_hi_u32 s16, s12, s13 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s15, s3, s13 +; GCN-NEXT: s_mul_hi_u32 s17, s3, s14 +; GCN-NEXT: s_mul_i32 s16, s12, s14 +; GCN-NEXT: s_add_i32 s15, s17, s15 +; GCN-NEXT: s_add_i32 s15, s15, s16 +; GCN-NEXT: s_mul_i32 s18, s3, s14 +; GCN-NEXT: s_mul_i32 s17, s14, s15 +; GCN-NEXT: s_mul_hi_u32 s19, s14, s18 +; GCN-NEXT: s_mul_hi_u32 s16, s14, s15 ; GCN-NEXT: s_add_u32 s17, s19, s17 ; GCN-NEXT: s_addc_u32 s16, 0, s16 -; GCN-NEXT: s_mul_hi_u32 s20, s15, s18 -; GCN-NEXT: s_mul_i32 s18, s15, s18 +; GCN-NEXT: s_mul_hi_u32 s20, s13, s18 +; GCN-NEXT: s_mul_i32 s18, s13, s18 ; GCN-NEXT: s_add_u32 s17, s17, s18 -; GCN-NEXT: s_mul_hi_u32 s19, s15, s13 +; GCN-NEXT: s_mul_hi_u32 s19, s13, s15 ; GCN-NEXT: s_addc_u32 s16, s16, s20 ; GCN-NEXT: s_addc_u32 s17, s19, 0 -; GCN-NEXT: s_mul_i32 s13, s15, s13 -; GCN-NEXT: s_add_u32 s13, s16, s13 +; GCN-NEXT: s_mul_i32 s15, s13, s15 +; GCN-NEXT: s_add_u32 s15, s16, s15 ; GCN-NEXT: s_addc_u32 s16, 0, s17 -; GCN-NEXT: s_add_u32 s17, s12, s13 -; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GCN-NEXT: s_addc_u32 s15, s15, s16 -; GCN-NEXT: s_mul_i32 s12, s3, s15 -; GCN-NEXT: s_mul_hi_u32 s13, s3, s17 -; GCN-NEXT: s_add_i32 s12, s13, s12 -; GCN-NEXT: s_mul_i32 s14, s14, s17 -; GCN-NEXT: s_add_i32 s12, s12, s14 -; GCN-NEXT: s_mul_i32 s3, s3, s17 -; GCN-NEXT: s_mul_hi_u32 s14, s15, s3 -; GCN-NEXT: s_mul_i32 s16, s15, s3 -; GCN-NEXT: s_mul_i32 s19, s17, s12 -; GCN-NEXT: s_mul_hi_u32 s3, s17, s3 -; GCN-NEXT: s_mul_hi_u32 s18, s17, s12 +; GCN-NEXT: s_add_u32 s14, s14, s15 +; GCN-NEXT: s_addc_u32 s13, s13, s16 +; GCN-NEXT: s_mul_i32 s15, s3, s13 +; GCN-NEXT: s_mul_hi_u32 s16, s3, s14 +; GCN-NEXT: s_add_i32 s15, s16, s15 +; GCN-NEXT: s_mul_i32 s12, s12, s14 +; GCN-NEXT: s_add_i32 s15, s15, s12 +; GCN-NEXT: s_mul_i32 s3, s3, s14 +; GCN-NEXT: s_mul_hi_u32 s16, s13, s3 +; GCN-NEXT: s_mul_i32 s17, s13, s3 +; GCN-NEXT: s_mul_i32 s19, s14, s15 +; GCN-NEXT: s_mul_hi_u32 s3, s14, s3 +; GCN-NEXT: s_mul_hi_u32 s18, s14, s15 ; GCN-NEXT: s_add_u32 s3, s3, s19 ; GCN-NEXT: s_addc_u32 s18, 0, s18 -; GCN-NEXT: s_add_u32 s3, s3, s16 -; GCN-NEXT: s_mul_hi_u32 s13, s15, s12 -; GCN-NEXT: s_addc_u32 s3, s18, s14 -; GCN-NEXT: s_addc_u32 s13, s13, 0 -; GCN-NEXT: s_mul_i32 s12, s15, s12 -; GCN-NEXT: s_add_u32 s3, s3, s12 -; GCN-NEXT: s_addc_u32 s14, 0, s13 -; GCN-NEXT: s_add_u32 s3, s17, s3 -; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GCN-NEXT: s_addc_u32 s16, s15, s14 +; GCN-NEXT: s_add_u32 s3, s3, s17 +; GCN-NEXT: s_mul_hi_u32 s12, s13, s15 +; GCN-NEXT: s_addc_u32 s3, s18, s16 +; GCN-NEXT: s_addc_u32 s12, s12, 0 +; GCN-NEXT: s_mul_i32 s15, s13, s15 +; GCN-NEXT: s_add_u32 s3, s3, s15 +; GCN-NEXT: s_addc_u32 s12, 0, s12 +; GCN-NEXT: s_add_u32 s3, s14, s3 +; GCN-NEXT: s_addc_u32 s16, s13, s12 ; GCN-NEXT: s_ashr_i32 s12, s5, 31 ; GCN-NEXT: s_add_u32 s14, s4, s12 ; GCN-NEXT: s_mov_b32 s13, s12 @@ -2974,11 +2950,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s3, s10, s3 ; GCN-NEXT: s_sub_u32 s3, s14, s3 ; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s14, s18, s11 ; GCN-NEXT: s_sub_u32 s20, s3, s10 ; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s21, s14, 0 ; GCN-NEXT: s_cmp_ge_u32 s21, s11 ; GCN-NEXT: s_cselect_b32 s22, -1, 0 @@ -2988,12 +2962,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s22, s23, s22 ; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s14, s14, s11 -; GCN-NEXT: s_sub_u32 s23, s20, s10 -; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 +; GCN-NEXT: s_sub_u32 s18, s20, s10 ; GCN-NEXT: s_subb_u32 s14, s14, 0 ; GCN-NEXT: s_cmp_lg_u32 s22, 0 -; GCN-NEXT: s_cselect_b32 s18, s23, s20 +; GCN-NEXT: s_cselect_b32 s18, s18, s20 ; GCN-NEXT: s_cselect_b32 s14, s14, s21 ; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s5, s15, s5 @@ -3463,11 +3435,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_readfirstlane_b32 s14, v0 ; TONGA-NEXT: s_sub_u32 s12, s12, s14 ; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, s7 ; TONGA-NEXT: s_sub_u32 s18, s12, s6 ; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s19, s1, 0 ; TONGA-NEXT: s_cmp_ge_u32 s19, s7 ; TONGA-NEXT: s_cselect_b32 s20, -1, 0 @@ -3477,12 +3447,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_cselect_b32 s20, s21, s20 ; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, s7 -; TONGA-NEXT: s_sub_u32 s21, s18, s6 -; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 +; TONGA-NEXT: s_sub_u32 s16, s18, s6 ; TONGA-NEXT: s_subb_u32 s1, s1, 0 ; TONGA-NEXT: s_cmp_lg_u32 s20, 0 -; TONGA-NEXT: s_cselect_b32 s16, s21, s18 +; TONGA-NEXT: s_cselect_b32 s16, s16, s18 ; TONGA-NEXT: s_cselect_b32 s1, s1, s19 ; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s3, s13, s3 @@ -4934,7 +4902,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GCN-NEXT: s_sub_u32 s17, 0, s6 -; GCN-NEXT: s_subb_u32 s24, 0, s7 +; GCN-NEXT: s_subb_u32 s22, 0, s7 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -4943,56 +4911,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s25, v1 -; GCN-NEXT: v_readfirstlane_b32 s22, v0 -; GCN-NEXT: s_mul_i32 s23, s17, s25 -; GCN-NEXT: s_mul_hi_u32 s27, s17, s22 -; GCN-NEXT: s_mul_i32 s26, s24, s22 -; GCN-NEXT: s_add_i32 s23, s27, s23 -; GCN-NEXT: s_add_i32 s23, s23, s26 -; GCN-NEXT: s_mul_i32 s28, s17, s22 -; GCN-NEXT: s_mul_i32 s27, s22, s23 -; GCN-NEXT: s_mul_hi_u32 s29, s22, s28 -; GCN-NEXT: s_mul_hi_u32 s26, s22, s23 +; GCN-NEXT: v_readfirstlane_b32 s23, v1 +; GCN-NEXT: v_readfirstlane_b32 s24, v0 +; GCN-NEXT: s_mul_i32 s25, s17, s23 +; GCN-NEXT: s_mul_hi_u32 s27, s17, s24 +; GCN-NEXT: s_mul_i32 s26, s22, s24 +; GCN-NEXT: s_add_i32 s25, s27, s25 +; GCN-NEXT: s_add_i32 s25, s25, s26 +; GCN-NEXT: s_mul_i32 s28, s17, s24 +; GCN-NEXT: s_mul_i32 s27, s24, s25 +; GCN-NEXT: s_mul_hi_u32 s29, s24, s28 +; GCN-NEXT: s_mul_hi_u32 s26, s24, s25 ; GCN-NEXT: s_add_u32 s27, s29, s27 ; GCN-NEXT: s_addc_u32 s26, 0, s26 -; GCN-NEXT: s_mul_hi_u32 s30, s25, s28 -; GCN-NEXT: s_mul_i32 s28, s25, s28 +; GCN-NEXT: s_mul_hi_u32 s30, s23, s28 +; GCN-NEXT: s_mul_i32 s28, s23, s28 ; GCN-NEXT: s_add_u32 s27, s27, s28 -; GCN-NEXT: s_mul_hi_u32 s29, s25, s23 +; GCN-NEXT: s_mul_hi_u32 s29, s23, s25 ; GCN-NEXT: s_addc_u32 s26, s26, s30 ; GCN-NEXT: s_addc_u32 s27, s29, 0 -; GCN-NEXT: s_mul_i32 s23, s25, s23 -; GCN-NEXT: s_add_u32 s23, s26, s23 +; GCN-NEXT: s_mul_i32 s25, s23, s25 +; GCN-NEXT: s_add_u32 s25, s26, s25 ; GCN-NEXT: s_addc_u32 s26, 0, s27 -; GCN-NEXT: s_add_u32 s27, s22, s23 -; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 -; GCN-NEXT: s_addc_u32 s25, s25, s26 -; GCN-NEXT: s_mul_i32 s22, s17, s25 -; GCN-NEXT: s_mul_hi_u32 s23, s17, s27 -; GCN-NEXT: s_add_i32 s22, s23, s22 -; GCN-NEXT: s_mul_i32 s24, s24, s27 -; GCN-NEXT: s_add_i32 s22, s22, s24 -; GCN-NEXT: s_mul_i32 s17, s17, s27 -; GCN-NEXT: s_mul_hi_u32 s24, s25, s17 -; GCN-NEXT: s_mul_i32 s26, s25, s17 -; GCN-NEXT: s_mul_i32 s29, s27, s22 -; GCN-NEXT: s_mul_hi_u32 s17, s27, s17 -; GCN-NEXT: s_mul_hi_u32 s28, s27, s22 +; GCN-NEXT: s_add_u32 s24, s24, s25 +; GCN-NEXT: s_addc_u32 s23, s23, s26 +; GCN-NEXT: s_mul_i32 s25, s17, s23 +; GCN-NEXT: s_mul_hi_u32 s26, s17, s24 +; GCN-NEXT: s_add_i32 s25, s26, s25 +; GCN-NEXT: s_mul_i32 s22, s22, s24 +; GCN-NEXT: s_add_i32 s25, s25, s22 +; GCN-NEXT: s_mul_i32 s17, s17, s24 +; GCN-NEXT: s_mul_hi_u32 s26, s23, s17 +; GCN-NEXT: s_mul_i32 s27, s23, s17 +; GCN-NEXT: s_mul_i32 s29, s24, s25 +; GCN-NEXT: s_mul_hi_u32 s17, s24, s17 +; GCN-NEXT: s_mul_hi_u32 s28, s24, s25 ; GCN-NEXT: s_add_u32 s17, s17, s29 ; GCN-NEXT: s_addc_u32 s28, 0, s28 -; GCN-NEXT: s_add_u32 s17, s17, s26 -; GCN-NEXT: s_mul_hi_u32 s23, s25, s22 -; GCN-NEXT: s_addc_u32 s17, s28, s24 -; GCN-NEXT: s_addc_u32 s23, s23, 0 -; GCN-NEXT: s_mul_i32 s22, s25, s22 -; GCN-NEXT: s_add_u32 s17, s17, s22 -; GCN-NEXT: s_addc_u32 s24, 0, s23 -; GCN-NEXT: s_add_u32 s17, s27, s17 -; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 -; GCN-NEXT: s_addc_u32 s26, s25, s24 +; GCN-NEXT: s_add_u32 s17, s17, s27 +; GCN-NEXT: s_mul_hi_u32 s22, s23, s25 +; GCN-NEXT: s_addc_u32 s17, s28, s26 +; GCN-NEXT: s_addc_u32 s22, s22, 0 +; GCN-NEXT: s_mul_i32 s25, s23, s25 +; GCN-NEXT: s_add_u32 s17, s17, s25 +; GCN-NEXT: s_addc_u32 s22, 0, s22 +; GCN-NEXT: s_add_u32 s17, s24, s17 +; GCN-NEXT: s_addc_u32 s26, s23, s22 ; GCN-NEXT: s_ashr_i32 s22, s19, 31 ; GCN-NEXT: s_add_u32 s24, s18, s22 ; GCN-NEXT: s_mov_b32 s23, s22 @@ -5021,11 +4985,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s17, s6, s17 ; GCN-NEXT: s_sub_u32 s17, s24, s17 ; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s24, s28, s7 ; GCN-NEXT: s_sub_u32 s30, s17, s6 ; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0 ; GCN-NEXT: s_subb_u32 s31, s24, 0 ; GCN-NEXT: s_cmp_ge_u32 s31, s7 ; GCN-NEXT: s_cselect_b32 s33, -1, 0 @@ -5035,12 +4997,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s33, s34, s33 ; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0 ; GCN-NEXT: s_subb_u32 s24, s24, s7 -; GCN-NEXT: s_sub_u32 s34, s30, s6 -; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0 +; GCN-NEXT: s_sub_u32 s28, s30, s6 ; GCN-NEXT: s_subb_u32 s24, s24, 0 ; GCN-NEXT: s_cmp_lg_u32 s33, 0 -; GCN-NEXT: s_cselect_b32 s28, s34, s30 +; GCN-NEXT: s_cselect_b32 s28, s28, s30 ; GCN-NEXT: s_cselect_b32 s24, s24, s31 ; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s19, s25, s19 @@ -5091,7 +5051,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s18 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s19 ; GCN-NEXT: s_sub_u32 s13, 0, s18 -; GCN-NEXT: s_subb_u32 s22, 0, s19 +; GCN-NEXT: s_subb_u32 s20, 0, s19 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -5100,56 +5060,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s23, v1 -; GCN-NEXT: v_readfirstlane_b32 s20, v0 -; GCN-NEXT: s_mul_i32 s21, s13, s23 -; GCN-NEXT: s_mul_hi_u32 s25, s13, s20 -; GCN-NEXT: s_mul_i32 s24, s22, s20 -; GCN-NEXT: s_add_i32 s21, s25, s21 -; GCN-NEXT: s_add_i32 s21, s21, s24 -; GCN-NEXT: s_mul_i32 s26, s13, s20 -; GCN-NEXT: s_mul_i32 s25, s20, s21 -; GCN-NEXT: s_mul_hi_u32 s27, s20, s26 -; GCN-NEXT: s_mul_hi_u32 s24, s20, s21 +; GCN-NEXT: v_readfirstlane_b32 s21, v1 +; GCN-NEXT: v_readfirstlane_b32 s22, v0 +; GCN-NEXT: s_mul_i32 s23, s13, s21 +; GCN-NEXT: s_mul_hi_u32 s25, s13, s22 +; GCN-NEXT: s_mul_i32 s24, s20, s22 +; GCN-NEXT: s_add_i32 s23, s25, s23 +; GCN-NEXT: s_add_i32 s23, s23, s24 +; GCN-NEXT: s_mul_i32 s26, s13, s22 +; GCN-NEXT: s_mul_i32 s25, s22, s23 +; GCN-NEXT: s_mul_hi_u32 s27, s22, s26 +; GCN-NEXT: s_mul_hi_u32 s24, s22, s23 ; GCN-NEXT: s_add_u32 s25, s27, s25 ; GCN-NEXT: s_addc_u32 s24, 0, s24 -; GCN-NEXT: s_mul_hi_u32 s28, s23, s26 -; GCN-NEXT: s_mul_i32 s26, s23, s26 +; GCN-NEXT: s_mul_hi_u32 s28, s21, s26 +; GCN-NEXT: s_mul_i32 s26, s21, s26 ; GCN-NEXT: s_add_u32 s25, s25, s26 -; GCN-NEXT: s_mul_hi_u32 s27, s23, s21 +; GCN-NEXT: s_mul_hi_u32 s27, s21, s23 ; GCN-NEXT: s_addc_u32 s24, s24, s28 ; GCN-NEXT: s_addc_u32 s25, s27, 0 -; GCN-NEXT: s_mul_i32 s21, s23, s21 -; GCN-NEXT: s_add_u32 s21, s24, s21 +; GCN-NEXT: s_mul_i32 s23, s21, s23 +; GCN-NEXT: s_add_u32 s23, s24, s23 ; GCN-NEXT: s_addc_u32 s24, 0, s25 -; GCN-NEXT: s_add_u32 s25, s20, s21 -; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 -; GCN-NEXT: s_addc_u32 s23, s23, s24 -; GCN-NEXT: s_mul_i32 s20, s13, s23 -; GCN-NEXT: s_mul_hi_u32 s21, s13, s25 -; GCN-NEXT: s_add_i32 s20, s21, s20 -; GCN-NEXT: s_mul_i32 s22, s22, s25 -; GCN-NEXT: s_add_i32 s20, s20, s22 -; GCN-NEXT: s_mul_i32 s13, s13, s25 -; GCN-NEXT: s_mul_hi_u32 s22, s23, s13 -; GCN-NEXT: s_mul_i32 s24, s23, s13 -; GCN-NEXT: s_mul_i32 s27, s25, s20 -; GCN-NEXT: s_mul_hi_u32 s13, s25, s13 -; GCN-NEXT: s_mul_hi_u32 s26, s25, s20 +; GCN-NEXT: s_add_u32 s22, s22, s23 +; GCN-NEXT: s_addc_u32 s21, s21, s24 +; GCN-NEXT: s_mul_i32 s23, s13, s21 +; GCN-NEXT: s_mul_hi_u32 s24, s13, s22 +; GCN-NEXT: s_add_i32 s23, s24, s23 +; GCN-NEXT: s_mul_i32 s20, s20, s22 +; GCN-NEXT: s_add_i32 s23, s23, s20 +; GCN-NEXT: s_mul_i32 s13, s13, s22 +; GCN-NEXT: s_mul_hi_u32 s24, s21, s13 +; GCN-NEXT: s_mul_i32 s25, s21, s13 +; GCN-NEXT: s_mul_i32 s27, s22, s23 +; GCN-NEXT: s_mul_hi_u32 s13, s22, s13 +; GCN-NEXT: s_mul_hi_u32 s26, s22, s23 ; GCN-NEXT: s_add_u32 s13, s13, s27 ; GCN-NEXT: s_addc_u32 s26, 0, s26 -; GCN-NEXT: s_add_u32 s13, s13, s24 -; GCN-NEXT: s_mul_hi_u32 s21, s23, s20 -; GCN-NEXT: s_addc_u32 s13, s26, s22 -; GCN-NEXT: s_addc_u32 s21, s21, 0 -; GCN-NEXT: s_mul_i32 s20, s23, s20 -; GCN-NEXT: s_add_u32 s13, s13, s20 -; GCN-NEXT: s_addc_u32 s22, 0, s21 -; GCN-NEXT: s_add_u32 s13, s25, s13 -; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 -; GCN-NEXT: s_addc_u32 s24, s23, s22 +; GCN-NEXT: s_add_u32 s13, s13, s25 +; GCN-NEXT: s_mul_hi_u32 s20, s21, s23 +; GCN-NEXT: s_addc_u32 s13, s26, s24 +; GCN-NEXT: s_addc_u32 s20, s20, 0 +; GCN-NEXT: s_mul_i32 s23, s21, s23 +; GCN-NEXT: s_add_u32 s13, s13, s23 +; GCN-NEXT: s_addc_u32 s20, 0, s20 +; GCN-NEXT: s_add_u32 s13, s22, s13 +; GCN-NEXT: s_addc_u32 s24, s21, s20 ; GCN-NEXT: s_ashr_i32 s20, s15, 31 ; GCN-NEXT: s_add_u32 s22, s14, s20 ; GCN-NEXT: s_mov_b32 s21, s20 @@ -5178,11 +5134,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s13, s18, s13 ; GCN-NEXT: s_sub_u32 s13, s22, s13 ; GCN-NEXT: s_cselect_b64 s[24:25], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0 ; GCN-NEXT: s_subb_u32 s22, s26, s19 ; GCN-NEXT: s_sub_u32 s28, s13, s18 ; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s29, s22, 0 ; GCN-NEXT: s_cmp_ge_u32 s29, s19 ; GCN-NEXT: s_cselect_b32 s30, -1, 0 @@ -5192,12 +5146,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s30, s31, s30 ; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s22, s22, s19 -; GCN-NEXT: s_sub_u32 s31, s28, s18 -; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 +; GCN-NEXT: s_sub_u32 s26, s28, s18 ; GCN-NEXT: s_subb_u32 s22, s22, 0 ; GCN-NEXT: s_cmp_lg_u32 s30, 0 -; GCN-NEXT: s_cselect_b32 s26, s31, s28 +; GCN-NEXT: s_cselect_b32 s26, s26, s28 ; GCN-NEXT: s_cselect_b32 s22, s22, s29 ; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0 ; GCN-NEXT: s_subb_u32 s15, s23, s15 @@ -5257,7 +5209,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s14 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s15 ; GCN-NEXT: s_sub_u32 s9, 0, s14 -; GCN-NEXT: s_subb_u32 s18, 0, s15 +; GCN-NEXT: s_subb_u32 s16, 0, s15 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -5266,56 +5218,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s19, v1 -; GCN-NEXT: v_readfirstlane_b32 s16, v0 -; GCN-NEXT: s_mul_i32 s17, s9, s19 -; GCN-NEXT: s_mul_hi_u32 s21, s9, s16 -; GCN-NEXT: s_mul_i32 s20, s18, s16 -; GCN-NEXT: s_add_i32 s17, s21, s17 -; GCN-NEXT: s_add_i32 s17, s17, s20 -; GCN-NEXT: s_mul_i32 s22, s9, s16 -; GCN-NEXT: s_mul_i32 s21, s16, s17 -; GCN-NEXT: s_mul_hi_u32 s23, s16, s22 -; GCN-NEXT: s_mul_hi_u32 s20, s16, s17 +; GCN-NEXT: v_readfirstlane_b32 s17, v1 +; GCN-NEXT: v_readfirstlane_b32 s18, v0 +; GCN-NEXT: s_mul_i32 s19, s9, s17 +; GCN-NEXT: s_mul_hi_u32 s21, s9, s18 +; GCN-NEXT: s_mul_i32 s20, s16, s18 +; GCN-NEXT: s_add_i32 s19, s21, s19 +; GCN-NEXT: s_add_i32 s19, s19, s20 +; GCN-NEXT: s_mul_i32 s22, s9, s18 +; GCN-NEXT: s_mul_i32 s21, s18, s19 +; GCN-NEXT: s_mul_hi_u32 s23, s18, s22 +; GCN-NEXT: s_mul_hi_u32 s20, s18, s19 ; GCN-NEXT: s_add_u32 s21, s23, s21 ; GCN-NEXT: s_addc_u32 s20, 0, s20 -; GCN-NEXT: s_mul_hi_u32 s24, s19, s22 -; GCN-NEXT: s_mul_i32 s22, s19, s22 +; GCN-NEXT: s_mul_hi_u32 s24, s17, s22 +; GCN-NEXT: s_mul_i32 s22, s17, s22 ; GCN-NEXT: s_add_u32 s21, s21, s22 -; GCN-NEXT: s_mul_hi_u32 s23, s19, s17 +; GCN-NEXT: s_mul_hi_u32 s23, s17, s19 ; GCN-NEXT: s_addc_u32 s20, s20, s24 ; GCN-NEXT: s_addc_u32 s21, s23, 0 -; GCN-NEXT: s_mul_i32 s17, s19, s17 -; GCN-NEXT: s_add_u32 s17, s20, s17 +; GCN-NEXT: s_mul_i32 s19, s17, s19 +; GCN-NEXT: s_add_u32 s19, s20, s19 ; GCN-NEXT: s_addc_u32 s20, 0, s21 -; GCN-NEXT: s_add_u32 s21, s16, s17 -; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 -; GCN-NEXT: s_addc_u32 s19, s19, s20 -; GCN-NEXT: s_mul_i32 s16, s9, s19 -; GCN-NEXT: s_mul_hi_u32 s17, s9, s21 -; GCN-NEXT: s_add_i32 s16, s17, s16 -; GCN-NEXT: s_mul_i32 s18, s18, s21 -; GCN-NEXT: s_add_i32 s16, s16, s18 -; GCN-NEXT: s_mul_i32 s9, s9, s21 -; GCN-NEXT: s_mul_hi_u32 s18, s19, s9 -; GCN-NEXT: s_mul_i32 s20, s19, s9 -; GCN-NEXT: s_mul_i32 s23, s21, s16 -; GCN-NEXT: s_mul_hi_u32 s9, s21, s9 -; GCN-NEXT: s_mul_hi_u32 s22, s21, s16 +; GCN-NEXT: s_add_u32 s18, s18, s19 +; GCN-NEXT: s_addc_u32 s17, s17, s20 +; GCN-NEXT: s_mul_i32 s19, s9, s17 +; GCN-NEXT: s_mul_hi_u32 s20, s9, s18 +; GCN-NEXT: s_add_i32 s19, s20, s19 +; GCN-NEXT: s_mul_i32 s16, s16, s18 +; GCN-NEXT: s_add_i32 s19, s19, s16 +; GCN-NEXT: s_mul_i32 s9, s9, s18 +; GCN-NEXT: s_mul_hi_u32 s20, s17, s9 +; GCN-NEXT: s_mul_i32 s21, s17, s9 +; GCN-NEXT: s_mul_i32 s23, s18, s19 +; GCN-NEXT: s_mul_hi_u32 s9, s18, s9 +; GCN-NEXT: s_mul_hi_u32 s22, s18, s19 ; GCN-NEXT: s_add_u32 s9, s9, s23 ; GCN-NEXT: s_addc_u32 s22, 0, s22 -; GCN-NEXT: s_add_u32 s9, s9, s20 -; GCN-NEXT: s_mul_hi_u32 s17, s19, s16 -; GCN-NEXT: s_addc_u32 s9, s22, s18 -; GCN-NEXT: s_addc_u32 s17, s17, 0 -; GCN-NEXT: s_mul_i32 s16, s19, s16 -; GCN-NEXT: s_add_u32 s9, s9, s16 -; GCN-NEXT: s_addc_u32 s18, 0, s17 -; GCN-NEXT: s_add_u32 s9, s21, s9 -; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 -; GCN-NEXT: s_addc_u32 s20, s19, s18 +; GCN-NEXT: s_add_u32 s9, s9, s21 +; GCN-NEXT: s_mul_hi_u32 s16, s17, s19 +; GCN-NEXT: s_addc_u32 s9, s22, s20 +; GCN-NEXT: s_addc_u32 s16, s16, 0 +; GCN-NEXT: s_mul_i32 s19, s17, s19 +; GCN-NEXT: s_add_u32 s9, s9, s19 +; GCN-NEXT: s_addc_u32 s16, 0, s16 +; GCN-NEXT: s_add_u32 s9, s18, s9 +; GCN-NEXT: s_addc_u32 s20, s17, s16 ; GCN-NEXT: s_ashr_i32 s16, s11, 31 ; GCN-NEXT: s_add_u32 s18, s10, s16 ; GCN-NEXT: s_mov_b32 s17, s16 @@ -5344,11 +5292,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s9, s14, s9 ; GCN-NEXT: s_sub_u32 s9, s18, s9 ; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s18, s22, s15 ; GCN-NEXT: s_sub_u32 s24, s9, s14 ; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 ; GCN-NEXT: s_subb_u32 s25, s18, 0 ; GCN-NEXT: s_cmp_ge_u32 s25, s15 ; GCN-NEXT: s_cselect_b32 s26, -1, 0 @@ -5358,12 +5304,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s26, s27, s26 ; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 ; GCN-NEXT: s_subb_u32 s18, s18, s15 -; GCN-NEXT: s_sub_u32 s27, s24, s14 -; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 +; GCN-NEXT: s_sub_u32 s22, s24, s14 ; GCN-NEXT: s_subb_u32 s18, s18, 0 ; GCN-NEXT: s_cmp_lg_u32 s26, 0 -; GCN-NEXT: s_cselect_b32 s22, s27, s24 +; GCN-NEXT: s_cselect_b32 s22, s22, s24 ; GCN-NEXT: s_cselect_b32 s18, s18, s25 ; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s11, s19, s11 @@ -5420,7 +5364,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GCN-NEXT: s_sub_u32 s3, 0, s10 -; GCN-NEXT: s_subb_u32 s14, 0, s11 +; GCN-NEXT: s_subb_u32 s12, 0, s11 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -5429,56 +5373,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s15, v1 -; GCN-NEXT: v_readfirstlane_b32 s12, v0 -; GCN-NEXT: s_mul_i32 s13, s3, s15 -; GCN-NEXT: s_mul_hi_u32 s17, s3, s12 -; GCN-NEXT: s_mul_i32 s16, s14, s12 -; GCN-NEXT: s_add_i32 s13, s17, s13 -; GCN-NEXT: s_add_i32 s13, s13, s16 -; GCN-NEXT: s_mul_i32 s18, s3, s12 -; GCN-NEXT: s_mul_i32 s17, s12, s13 -; GCN-NEXT: s_mul_hi_u32 s19, s12, s18 -; GCN-NEXT: s_mul_hi_u32 s16, s12, s13 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s15, s3, s13 +; GCN-NEXT: s_mul_hi_u32 s17, s3, s14 +; GCN-NEXT: s_mul_i32 s16, s12, s14 +; GCN-NEXT: s_add_i32 s15, s17, s15 +; GCN-NEXT: s_add_i32 s15, s15, s16 +; GCN-NEXT: s_mul_i32 s18, s3, s14 +; GCN-NEXT: s_mul_i32 s17, s14, s15 +; GCN-NEXT: s_mul_hi_u32 s19, s14, s18 +; GCN-NEXT: s_mul_hi_u32 s16, s14, s15 ; GCN-NEXT: s_add_u32 s17, s19, s17 ; GCN-NEXT: s_addc_u32 s16, 0, s16 -; GCN-NEXT: s_mul_hi_u32 s20, s15, s18 -; GCN-NEXT: s_mul_i32 s18, s15, s18 +; GCN-NEXT: s_mul_hi_u32 s20, s13, s18 +; GCN-NEXT: s_mul_i32 s18, s13, s18 ; GCN-NEXT: s_add_u32 s17, s17, s18 -; GCN-NEXT: s_mul_hi_u32 s19, s15, s13 +; GCN-NEXT: s_mul_hi_u32 s19, s13, s15 ; GCN-NEXT: s_addc_u32 s16, s16, s20 ; GCN-NEXT: s_addc_u32 s17, s19, 0 -; GCN-NEXT: s_mul_i32 s13, s15, s13 -; GCN-NEXT: s_add_u32 s13, s16, s13 +; GCN-NEXT: s_mul_i32 s15, s13, s15 +; GCN-NEXT: s_add_u32 s15, s16, s15 ; GCN-NEXT: s_addc_u32 s16, 0, s17 -; GCN-NEXT: s_add_u32 s17, s12, s13 -; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GCN-NEXT: s_addc_u32 s15, s15, s16 -; GCN-NEXT: s_mul_i32 s12, s3, s15 -; GCN-NEXT: s_mul_hi_u32 s13, s3, s17 -; GCN-NEXT: s_add_i32 s12, s13, s12 -; GCN-NEXT: s_mul_i32 s14, s14, s17 -; GCN-NEXT: s_add_i32 s12, s12, s14 -; GCN-NEXT: s_mul_i32 s3, s3, s17 -; GCN-NEXT: s_mul_hi_u32 s14, s15, s3 -; GCN-NEXT: s_mul_i32 s16, s15, s3 -; GCN-NEXT: s_mul_i32 s19, s17, s12 -; GCN-NEXT: s_mul_hi_u32 s3, s17, s3 -; GCN-NEXT: s_mul_hi_u32 s18, s17, s12 +; GCN-NEXT: s_add_u32 s14, s14, s15 +; GCN-NEXT: s_addc_u32 s13, s13, s16 +; GCN-NEXT: s_mul_i32 s15, s3, s13 +; GCN-NEXT: s_mul_hi_u32 s16, s3, s14 +; GCN-NEXT: s_add_i32 s15, s16, s15 +; GCN-NEXT: s_mul_i32 s12, s12, s14 +; GCN-NEXT: s_add_i32 s15, s15, s12 +; GCN-NEXT: s_mul_i32 s3, s3, s14 +; GCN-NEXT: s_mul_hi_u32 s16, s13, s3 +; GCN-NEXT: s_mul_i32 s17, s13, s3 +; GCN-NEXT: s_mul_i32 s19, s14, s15 +; GCN-NEXT: s_mul_hi_u32 s3, s14, s3 +; GCN-NEXT: s_mul_hi_u32 s18, s14, s15 ; GCN-NEXT: s_add_u32 s3, s3, s19 ; GCN-NEXT: s_addc_u32 s18, 0, s18 -; GCN-NEXT: s_add_u32 s3, s3, s16 -; GCN-NEXT: s_mul_hi_u32 s13, s15, s12 -; GCN-NEXT: s_addc_u32 s3, s18, s14 -; GCN-NEXT: s_addc_u32 s13, s13, 0 -; GCN-NEXT: s_mul_i32 s12, s15, s12 -; GCN-NEXT: s_add_u32 s3, s3, s12 -; GCN-NEXT: s_addc_u32 s14, 0, s13 -; GCN-NEXT: s_add_u32 s3, s17, s3 -; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GCN-NEXT: s_addc_u32 s16, s15, s14 +; GCN-NEXT: s_add_u32 s3, s3, s17 +; GCN-NEXT: s_mul_hi_u32 s12, s13, s15 +; GCN-NEXT: s_addc_u32 s3, s18, s16 +; GCN-NEXT: s_addc_u32 s12, s12, 0 +; GCN-NEXT: s_mul_i32 s15, s13, s15 +; GCN-NEXT: s_add_u32 s3, s3, s15 +; GCN-NEXT: s_addc_u32 s12, 0, s12 +; GCN-NEXT: s_add_u32 s3, s14, s3 +; GCN-NEXT: s_addc_u32 s16, s13, s12 ; GCN-NEXT: s_ashr_i32 s12, s5, 31 ; GCN-NEXT: s_add_u32 s14, s4, s12 ; GCN-NEXT: s_mov_b32 s13, s12 @@ -5507,11 +5447,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s3, s10, s3 ; GCN-NEXT: s_sub_u32 s3, s14, s3 ; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s14, s18, s11 ; GCN-NEXT: s_sub_u32 s20, s3, s10 ; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s21, s14, 0 ; GCN-NEXT: s_cmp_ge_u32 s21, s11 ; GCN-NEXT: s_cselect_b32 s22, -1, 0 @@ -5521,12 +5459,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s22, s23, s22 ; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s14, s14, s11 -; GCN-NEXT: s_sub_u32 s23, s20, s10 -; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 +; GCN-NEXT: s_sub_u32 s18, s20, s10 ; GCN-NEXT: s_subb_u32 s14, s14, 0 ; GCN-NEXT: s_cmp_lg_u32 s22, 0 -; GCN-NEXT: s_cselect_b32 s18, s23, s20 +; GCN-NEXT: s_cselect_b32 s18, s18, s20 ; GCN-NEXT: s_cselect_b32 s14, s14, s21 ; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s5, s15, s5 @@ -6299,11 +6235,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_readfirstlane_b32 s14, v8 ; TONGA-NEXT: s_sub_u32 s12, s12, s14 ; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, s7 ; TONGA-NEXT: s_sub_u32 s18, s12, s6 ; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s19, s1, 0 ; TONGA-NEXT: s_cmp_ge_u32 s19, s7 ; TONGA-NEXT: s_cselect_b32 s20, -1, 0 @@ -6313,12 +6247,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_cselect_b32 s20, s21, s20 ; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, s7 -; TONGA-NEXT: s_sub_u32 s21, s18, s6 -; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 +; TONGA-NEXT: s_sub_u32 s16, s18, s6 ; TONGA-NEXT: s_subb_u32 s1, s1, 0 ; TONGA-NEXT: s_cmp_lg_u32 s20, 0 -; TONGA-NEXT: s_cselect_b32 s16, s21, s18 +; TONGA-NEXT: s_cselect_b32 s16, s16, s18 ; TONGA-NEXT: s_cselect_b32 s1, s1, s19 ; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s3, s13, s3 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index 33b0a5d..ea9bb04 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -51,10 +51,9 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_addc_u32 s13, 0, s14 ; GCN-NEXT: s_add_u32 s14, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s12, s12, s13 ; GCN-NEXT: s_mul_i32 s0, s10, s12 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 @@ -85,7 +84,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_add_u32 s11, s14, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s1, s12, s10 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 @@ -115,46 +113,43 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: v_readfirstlane_b32 s10, v0 ; GCN-NEXT: s_add_i32 s5, s10, s5 ; GCN-NEXT: s_mul_i32 s10, s9, s4 -; GCN-NEXT: s_add_i32 s10, s5, s10 -; GCN-NEXT: s_sub_i32 s11, s7, s10 +; GCN-NEXT: s_add_i32 s12, s5, s10 +; GCN-NEXT: s_sub_i32 s10, s7, s12 ; GCN-NEXT: s_mul_i32 s4, s8, s4 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s12, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s12, 0 -; GCN-NEXT: s_subb_u32 s11, s11, s9 -; GCN-NEXT: s_sub_u32 s13, s6, s8 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_or_b32 s11, s4, s5 +; GCN-NEXT: s_subb_u32 s13, s10, s9 +; GCN-NEXT: s_sub_u32 s14, s6, s8 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s15, s10, s11 +; GCN-NEXT: s_subb_u32 s15, s13, 0 +; GCN-NEXT: s_cmp_ge_u32 s15, s9 +; GCN-NEXT: s_cselect_b32 s16, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s8 +; GCN-NEXT: s_cselect_b32 s17, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s15, s9 +; GCN-NEXT: s_cselect_b32 s16, s17, s16 +; GCN-NEXT: s_or_b32 s10, s10, s11 +; GCN-NEXT: s_subb_u32 s13, s13, s9 +; GCN-NEXT: s_sub_u32 s17, s14, s8 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s10, s10, s11 +; GCN-NEXT: s_subb_u32 s10, s13, 0 +; GCN-NEXT: s_cmp_lg_u32 s16, 0 +; GCN-NEXT: s_cselect_b32 s11, s17, s14 +; GCN-NEXT: s_cselect_b32 s10, s10, s15 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_subb_u32 s14, s11, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s9 +; GCN-NEXT: s_subb_u32 s4, s7, s12 +; GCN-NEXT: s_cmp_ge_u32 s4, s9 ; GCN-NEXT: s_cselect_b32 s5, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s13, s8 -; GCN-NEXT: s_cselect_b32 s15, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s14, s9 -; GCN-NEXT: s_cselect_b32 s15, s15, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_subb_u32 s11, s11, s9 -; GCN-NEXT: s_sub_u32 s16, s13, s8 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_subb_u32 s4, s11, 0 -; GCN-NEXT: s_cmp_lg_u32 s15, 0 -; GCN-NEXT: s_cselect_b32 s5, s16, s13 -; GCN-NEXT: s_cselect_b32 s4, s4, s14 -; GCN-NEXT: s_cmp_lg_u32 s12, 0 -; GCN-NEXT: s_subb_u32 s7, s7, s10 -; GCN-NEXT: s_cmp_ge_u32 s7, s9 -; GCN-NEXT: s_cselect_b32 s10, -1, 0 ; GCN-NEXT: s_cmp_ge_u32 s6, s8 -; GCN-NEXT: s_cselect_b32 s8, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s7, s9 -; GCN-NEXT: s_cselect_b32 s8, s8, s10 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_cselect_b32 s4, s4, s7 -; GCN-NEXT: s_cselect_b32 s5, s5, s6 +; GCN-NEXT: s_cselect_b32 s7, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s4, s9 +; GCN-NEXT: s_cselect_b32 s5, s7, s5 +; GCN-NEXT: s_cmp_lg_u32 s5, 0 +; GCN-NEXT: s_cselect_b32 s4, s10, s4 +; GCN-NEXT: s_cselect_b32 s5, s11, s6 ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -187,7 +182,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_or_b32 s8, s8, s9 -; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -221,7 +215,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_or_b32 s18, s18, s19 -; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] @@ -1016,10 +1009,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_addc_u32 s13, 0, s14 ; GCN-NEXT: s_add_u32 s14, s8, s9 ; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: s_addc_u32 s12, s12, s13 ; GCN-NEXT: s_mul_i32 s8, s10, s12 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 @@ -1050,7 +1042,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_add_u32 s11, s14, s8 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: s_addc_u32 s10, s12, s10 ; GCN-NEXT: s_ashr_i32 s8, s7, 31 ; GCN-NEXT: s_add_u32 s6, s6, s8 @@ -1083,46 +1074,43 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_readfirstlane_b32 s12, v0 ; GCN-NEXT: s_add_i32 s11, s12, s11 ; GCN-NEXT: s_mul_i32 s12, s5, s10 -; GCN-NEXT: s_add_i32 s12, s11, s12 -; GCN-NEXT: s_sub_i32 s13, s7, s12 +; GCN-NEXT: s_add_i32 s14, s11, s12 +; GCN-NEXT: s_sub_i32 s12, s7, s14 ; GCN-NEXT: s_mul_i32 s10, s4, s10 ; GCN-NEXT: s_sub_u32 s6, s6, s10 ; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s14, s10, s11 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_subb_u32 s13, s13, s5 -; GCN-NEXT: s_sub_u32 s15, s6, s4 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_cmp_lg_u32 s10, 0 -; GCN-NEXT: s_subb_u32 s16, s13, 0 -; GCN-NEXT: s_cmp_ge_u32 s16, s5 -; GCN-NEXT: s_cselect_b32 s11, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s15, s4 -; GCN-NEXT: s_cselect_b32 s17, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s16, s5 -; GCN-NEXT: s_cselect_b32 s17, s17, s11 -; GCN-NEXT: s_cmp_lg_u32 s10, 0 -; GCN-NEXT: s_subb_u32 s13, s13, s5 -; GCN-NEXT: s_sub_u32 s18, s15, s4 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s13, s10, s11 +; GCN-NEXT: s_subb_u32 s15, s12, s5 +; GCN-NEXT: s_sub_u32 s16, s6, s4 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_or_b32 s17, s12, s13 +; GCN-NEXT: s_subb_u32 s17, s15, 0 +; GCN-NEXT: s_cmp_ge_u32 s17, s5 +; GCN-NEXT: s_cselect_b32 s18, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s16, s4 +; GCN-NEXT: s_cselect_b32 s19, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s17, s5 +; GCN-NEXT: s_cselect_b32 s18, s19, s18 +; GCN-NEXT: s_or_b32 s12, s12, s13 +; GCN-NEXT: s_subb_u32 s15, s15, s5 +; GCN-NEXT: s_sub_u32 s19, s16, s4 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_or_b32 s12, s12, s13 +; GCN-NEXT: s_subb_u32 s12, s15, 0 +; GCN-NEXT: s_cmp_lg_u32 s18, 0 +; GCN-NEXT: s_cselect_b32 s13, s19, s16 +; GCN-NEXT: s_cselect_b32 s12, s12, s17 ; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_cmp_lg_u32 s10, 0 -; GCN-NEXT: s_subb_u32 s10, s13, 0 -; GCN-NEXT: s_cmp_lg_u32 s17, 0 -; GCN-NEXT: s_cselect_b32 s11, s18, s15 -; GCN-NEXT: s_cselect_b32 s10, s10, s16 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_subb_u32 s7, s7, s12 +; GCN-NEXT: s_subb_u32 s7, s7, s14 ; GCN-NEXT: s_cmp_ge_u32 s7, s5 -; GCN-NEXT: s_cselect_b32 s12, -1, 0 +; GCN-NEXT: s_cselect_b32 s10, -1, 0 ; GCN-NEXT: s_cmp_ge_u32 s6, s4 ; GCN-NEXT: s_cselect_b32 s4, -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s7, s5 -; GCN-NEXT: s_cselect_b32 s4, s4, s12 +; GCN-NEXT: s_cselect_b32 s4, s4, s10 ; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_cselect_b32 s5, s10, s7 -; GCN-NEXT: s_cselect_b32 s4, s11, s6 +; GCN-NEXT: s_cselect_b32 s5, s12, s7 +; GCN-NEXT: s_cselect_b32 s4, s13, s6 ; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] ; GCN-NEXT: s_sub_u32 s4, s4, s8 ; GCN-NEXT: s_subb_u32 s5, s5, s8 @@ -1170,7 +1158,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_add_u32 s16, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_or_b32 s10, s10, s11 -; GCN-IR-NEXT: s_cmp_lg_u32 s10, 0 ; GCN-IR-NEXT: s_addc_u32 s10, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s14, 63, s14 @@ -1204,7 +1191,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_add_u32 s18, s18, 1 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_or_b32 s20, s20, s21 -; GCN-IR-NEXT: s_cmp_lg_u32 s20, 0 ; GCN-IR-NEXT: s_addc_u32 s19, s19, 0 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3] @@ -1369,10 +1355,9 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_addc_u32 s10, 0, s11 ; GCN-NEXT: s_add_u32 s11, s6, s7 ; GCN-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 +; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_addc_u32 s9, s9, s10 ; GCN-NEXT: s_mul_i32 s6, s2, s9 ; GCN-NEXT: v_readfirstlane_b32 s7, v0 @@ -1403,7 +1388,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_add_u32 s2, s11, s2 ; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_addc_u32 s6, s9, s8 ; GCN-NEXT: v_mul_hi_u32 v1, s2, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s6, 24 @@ -1418,45 +1402,42 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_mul_i32 s7, s5, s6 ; GCN-NEXT: s_mul_i32 s6, s4, s6 ; GCN-NEXT: v_readfirstlane_b32 s8, v0 -; GCN-NEXT: s_add_i32 s8, s8, s7 -; GCN-NEXT: s_sub_i32 s9, 0, s8 -; GCN-NEXT: s_sub_u32 s10, 24, s6 -; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-NEXT: s_or_b32 s11, s6, s7 -; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_subb_u32 s9, s9, s5 -; GCN-NEXT: s_sub_u32 s12, s10, s4 +; GCN-NEXT: s_add_i32 s10, s8, s7 +; GCN-NEXT: s_sub_i32 s8, 0, s10 +; GCN-NEXT: s_sub_u32 s11, 24, s6 ; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN-NEXT: s_or_b32 s9, s6, s7 +; GCN-NEXT: s_subb_u32 s12, s8, s5 +; GCN-NEXT: s_sub_u32 s13, s11, s4 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s14, s8, s9 +; GCN-NEXT: s_subb_u32 s14, s12, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s5 +; GCN-NEXT: s_cselect_b32 s15, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s13, s4 +; GCN-NEXT: s_cselect_b32 s16, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s14, s5 +; GCN-NEXT: s_cselect_b32 s15, s16, s15 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_subb_u32 s12, s12, s5 +; GCN-NEXT: s_sub_u32 s16, s13, s4 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_subb_u32 s8, s12, 0 +; GCN-NEXT: s_cmp_lg_u32 s15, 0 +; GCN-NEXT: s_cselect_b32 s9, s16, s13 +; GCN-NEXT: s_cselect_b32 s8, s8, s14 ; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_subb_u32 s13, s9, 0 -; GCN-NEXT: s_cmp_ge_u32 s13, s5 +; GCN-NEXT: s_subb_u32 s6, 0, s10 +; GCN-NEXT: s_cmp_ge_u32 s6, s5 ; GCN-NEXT: s_cselect_b32 s7, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s12, s4 -; GCN-NEXT: s_cselect_b32 s14, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s13, s5 -; GCN-NEXT: s_cselect_b32 s14, s14, s7 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_subb_u32 s9, s9, s5 -; GCN-NEXT: s_sub_u32 s15, s12, s4 -; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_subb_u32 s6, s9, 0 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_cselect_b32 s7, s15, s12 -; GCN-NEXT: s_cselect_b32 s6, s6, s13 -; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_subb_u32 s8, 0, s8 -; GCN-NEXT: s_cmp_ge_u32 s8, s5 -; GCN-NEXT: s_cselect_b32 s9, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s10, s4 +; GCN-NEXT: s_cmp_ge_u32 s11, s4 ; GCN-NEXT: s_cselect_b32 s4, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s8, s5 -; GCN-NEXT: s_cselect_b32 s4, s4, s9 +; GCN-NEXT: s_cmp_eq_u32 s6, s5 +; GCN-NEXT: s_cselect_b32 s4, s4, s7 ; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_cselect_b32 s4, s6, s8 -; GCN-NEXT: s_cselect_b32 s5, s7, s10 +; GCN-NEXT: s_cselect_b32 s4, s8, s6 +; GCN-NEXT: s_cselect_b32 s5, s9, s11 ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1489,7 +1470,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s8, s2, 1 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_or_b32 s9, s10, s11 -; GCN-IR-NEXT: s_cmp_lg_u32 s9, 0 ; GCN-IR-NEXT: s_addc_u32 s3, s3, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s2, 63, s2 @@ -1522,7 +1502,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_or_b32 s16, s16, s17 -; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/stackguard.ll b/llvm/test/CodeGen/AMDGPU/stackguard.ll new file mode 100644 index 0000000..393686f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/stackguard.ll @@ -0,0 +1,14 @@ +; RUN: not llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s +; RUN: not llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s + +; FIXME: To actually support stackguard, need to fix intrinsic to +; return pointer in any address space. + +; CHECK: error: unable to lower stackguard +define i1 @test_stackguard(ptr %p1) { + %p2 = call ptr @llvm.stackguard() + %res = icmp ne ptr %p2, %p1 + ret i1 %res +} + +declare ptr @llvm.stackguard() diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll index bb5918b2..bdd22f25 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddo.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll @@ -18,7 +18,6 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: s_or_b32 s0, s0, s1 -; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_addc_u32 s3, s3, s9 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -35,10 +34,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s2, s2, s4 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 ; VI-NEXT: s_addc_u32 s3, s3, s5 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -53,14 +50,12 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s6, s2, s6 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s4, s3, s7 +; GFX9-NEXT: s_add_u32 s4, s2, s6 +; GFX9-NEXT: s_addc_u32 s5, s3, s7 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm @@ -73,8 +68,6 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, s6 -; GFX10-NEXT: s_cselect_b32 s4, -1, 0 -; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_addc_u32 s3, s3, s7 ; GFX10-NEXT: s_cselect_b32 s4, -1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 @@ -91,14 +84,12 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s2, s2, s4 -; GFX11-NEXT: s_cselect_b32 s4, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-NEXT: s_addc_u32 s3, s3, s5 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm @@ -444,7 +435,6 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_add_u32 s4, s4, s6 ; SI-NEXT: s_cselect_b64 s[12:13], -1, 0 ; SI-NEXT: s_or_b32 s6, s12, s13 -; SI-NEXT: s_cmp_lg_u32 s6, 0 ; SI-NEXT: s_addc_u32 s5, s5, s7 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 @@ -465,16 +455,14 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: s_add_u32 s2, s4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_add_u32 s0, s4, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_addc_u32 s1, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 -; VI-NEXT: s_addc_u32 s0, s5, s7 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_mov_b32_e32 v5, s0 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -486,12 +474,10 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s2, s12, s14 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_addc_u32 s0, s13, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_add_u32 s0, s12, s14 +; GFX9-NEXT: s_addc_u32 s1, s13, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -504,10 +490,8 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s12, s14 -; GFX10-NEXT: s_cselect_b32 s1, -1, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: s_addc_u32 s1, s13, s15 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_cselect_b32 s0, -1, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -520,10 +504,8 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s4, s4, s6 -; GFX11-NEXT: s_cselect_b32 s6, -1, 0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_cmp_lg_u32 s6, 0 ; GFX11-NEXT: s_addc_u32 s5, s5, s7 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index 41199b0..fd461ac 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -148,7 +148,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_or_b32 s8, s8, s9 -; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -182,7 +181,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_or_b32 s16, s16, s17 -; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[2:3], s[4:5] @@ -831,10 +829,9 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_addc_u32 s10, 0, s11 ; GCN-NEXT: s_add_u32 s11, s4, s5 ; GCN-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_addc_u32 s9, s9, s10 ; GCN-NEXT: s_mul_i32 s4, s6, s9 ; GCN-NEXT: v_readfirstlane_b32 s5, v0 @@ -865,7 +862,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_add_u32 s8, s11, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_addc_u32 s4, s9, s6 ; GCN-NEXT: v_mul_hi_u32 v1, s8, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s4, 24 @@ -874,52 +870,50 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_readfirstlane_b32 s8, v1 ; GCN-NEXT: v_readfirstlane_b32 s5, v0 ; GCN-NEXT: s_add_u32 s4, s8, s4 -; GCN-NEXT: s_addc_u32 s8, 0, s5 -; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: s_addc_u32 s10, 0, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_mul_i32 s0, s3, s8 +; GCN-NEXT: s_mul_i32 s0, s3, s10 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s9, s1, s0 -; GCN-NEXT: s_sub_i32 s10, 0, s9 -; GCN-NEXT: s_mul_i32 s0, s2, s8 -; GCN-NEXT: s_sub_u32 s11, 24, s0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s12, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s12, 0 -; GCN-NEXT: s_subb_u32 s10, s10, s3 -; GCN-NEXT: s_sub_u32 s13, s11, s2 +; GCN-NEXT: s_add_i32 s11, s1, s0 +; GCN-NEXT: s_sub_i32 s8, 0, s11 +; GCN-NEXT: s_mul_i32 s0, s2, s10 +; GCN-NEXT: s_sub_u32 s12, 24, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_or_b32 s9, s0, s1 +; GCN-NEXT: s_subb_u32 s13, s8, s3 +; GCN-NEXT: s_sub_u32 s14, s12, s2 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_subb_u32 s8, s13, 0 +; GCN-NEXT: s_cmp_ge_u32 s8, s3 +; GCN-NEXT: s_cselect_b32 s9, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s2 +; GCN-NEXT: s_cselect_b32 s13, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, s3 +; GCN-NEXT: s_cselect_b32 s8, s13, s9 +; GCN-NEXT: s_add_u32 s9, s10, 1 +; GCN-NEXT: s_addc_u32 s13, 0, 0 +; GCN-NEXT: s_add_u32 s14, s10, 2 +; GCN-NEXT: s_addc_u32 s15, 0, 0 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cselect_b32 s8, s14, s9 +; GCN-NEXT: s_cselect_b32 s9, s15, s13 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_subb_u32 s0, s10, 0 +; GCN-NEXT: s_subb_u32 s0, 0, s11 ; GCN-NEXT: s_cmp_ge_u32 s0, s3 ; GCN-NEXT: s_cselect_b32 s1, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s13, s2 -; GCN-NEXT: s_cselect_b32 s10, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s12, s2 +; GCN-NEXT: s_cselect_b32 s2, -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s0, s3 -; GCN-NEXT: s_cselect_b32 s0, s10, s1 -; GCN-NEXT: s_add_u32 s1, s8, 1 -; GCN-NEXT: s_addc_u32 s10, 0, 0 -; GCN-NEXT: s_add_u32 s13, s8, 2 -; GCN-NEXT: s_addc_u32 s14, 0, 0 +; GCN-NEXT: s_cselect_b32 s0, s2, s1 ; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_cselect_b32 s0, s13, s1 -; GCN-NEXT: s_cselect_b32 s1, s14, s10 -; GCN-NEXT: s_cmp_lg_u32 s12, 0 -; GCN-NEXT: s_subb_u32 s9, 0, s9 -; GCN-NEXT: s_cmp_ge_u32 s9, s3 -; GCN-NEXT: s_cselect_b32 s10, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s11, s2 -; GCN-NEXT: s_cselect_b32 s2, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s9, s3 -; GCN-NEXT: s_cselect_b32 s2, s2, s10 -; GCN-NEXT: s_cmp_lg_u32 s2, 0 -; GCN-NEXT: s_cselect_b32 s1, s1, 0 -; GCN-NEXT: s_cselect_b32 s0, s0, s8 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: s_cselect_b32 s0, s9, 0 +; GCN-NEXT: s_cselect_b32 s1, s8, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; @@ -945,7 +939,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s10, s8, 1 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_or_b32 s6, s6, s7 -; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -978,7 +971,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_or_b32 s16, s16, s17 -; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] @@ -1317,7 +1309,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s11, s8, 1 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_or_b32 s6, s6, s7 -; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -1347,7 +1338,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 ; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GCN-IR-NEXT: s_or_b32 s12, s12, s13 -; GCN-IR-NEXT: s_cmp_lg_u32 s12, 0 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll index 9bcba6c..2d7ce10 100644 --- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s define amdgpu_kernel void @uitofp_i16_to_f16( ; SI-LABEL: uitofp_i16_to_f16: diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index cdcc914..137dc1f 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -51,10 +51,9 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_addc_u32 s13, 0, s14 ; GCN-NEXT: s_add_u32 s14, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s12, s12, s13 ; GCN-NEXT: s_mul_i32 s0, s10, s12 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 @@ -85,7 +84,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_add_u32 s11, s14, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s1, s12, s10 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 @@ -115,46 +113,43 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: v_readfirstlane_b32 s10, v0 ; GCN-NEXT: s_add_i32 s5, s10, s5 ; GCN-NEXT: s_mul_i32 s10, s9, s4 -; GCN-NEXT: s_add_i32 s10, s5, s10 -; GCN-NEXT: s_sub_i32 s11, s7, s10 +; GCN-NEXT: s_add_i32 s12, s5, s10 +; GCN-NEXT: s_sub_i32 s10, s7, s12 ; GCN-NEXT: s_mul_i32 s4, s8, s4 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s12, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s12, 0 -; GCN-NEXT: s_subb_u32 s11, s11, s9 -; GCN-NEXT: s_sub_u32 s13, s6, s8 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_or_b32 s11, s4, s5 +; GCN-NEXT: s_subb_u32 s13, s10, s9 +; GCN-NEXT: s_sub_u32 s14, s6, s8 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s15, s10, s11 +; GCN-NEXT: s_subb_u32 s15, s13, 0 +; GCN-NEXT: s_cmp_ge_u32 s15, s9 +; GCN-NEXT: s_cselect_b32 s16, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s8 +; GCN-NEXT: s_cselect_b32 s17, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s15, s9 +; GCN-NEXT: s_cselect_b32 s16, s17, s16 +; GCN-NEXT: s_or_b32 s10, s10, s11 +; GCN-NEXT: s_subb_u32 s13, s13, s9 +; GCN-NEXT: s_sub_u32 s17, s14, s8 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s10, s10, s11 +; GCN-NEXT: s_subb_u32 s10, s13, 0 +; GCN-NEXT: s_cmp_lg_u32 s16, 0 +; GCN-NEXT: s_cselect_b32 s11, s17, s14 +; GCN-NEXT: s_cselect_b32 s10, s10, s15 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_subb_u32 s14, s11, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s9 +; GCN-NEXT: s_subb_u32 s4, s7, s12 +; GCN-NEXT: s_cmp_ge_u32 s4, s9 ; GCN-NEXT: s_cselect_b32 s5, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s13, s8 -; GCN-NEXT: s_cselect_b32 s15, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s14, s9 -; GCN-NEXT: s_cselect_b32 s15, s15, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_subb_u32 s11, s11, s9 -; GCN-NEXT: s_sub_u32 s16, s13, s8 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_subb_u32 s4, s11, 0 -; GCN-NEXT: s_cmp_lg_u32 s15, 0 -; GCN-NEXT: s_cselect_b32 s5, s16, s13 -; GCN-NEXT: s_cselect_b32 s4, s4, s14 -; GCN-NEXT: s_cmp_lg_u32 s12, 0 -; GCN-NEXT: s_subb_u32 s7, s7, s10 -; GCN-NEXT: s_cmp_ge_u32 s7, s9 -; GCN-NEXT: s_cselect_b32 s10, -1, 0 ; GCN-NEXT: s_cmp_ge_u32 s6, s8 -; GCN-NEXT: s_cselect_b32 s8, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s7, s9 -; GCN-NEXT: s_cselect_b32 s8, s8, s10 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_cselect_b32 s4, s4, s7 -; GCN-NEXT: s_cselect_b32 s5, s5, s6 +; GCN-NEXT: s_cselect_b32 s7, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s4, s9 +; GCN-NEXT: s_cselect_b32 s5, s7, s5 +; GCN-NEXT: s_cmp_lg_u32 s5, 0 +; GCN-NEXT: s_cselect_b32 s4, s10, s4 +; GCN-NEXT: s_cselect_b32 s5, s11, s6 ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -187,7 +182,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_or_b32 s8, s8, s9 -; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -221,7 +215,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_or_b32 s18, s18, s19 -; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] @@ -853,10 +846,9 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_addc_u32 s10, 0, s11 ; GCN-NEXT: s_add_u32 s11, s4, s5 ; GCN-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_addc_u32 s9, s9, s10 ; GCN-NEXT: s_mul_i32 s4, s6, s9 ; GCN-NEXT: v_readfirstlane_b32 s5, v0 @@ -887,7 +879,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_add_u32 s8, s11, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_addc_u32 s4, s9, s6 ; GCN-NEXT: v_mul_hi_u32 v1, s8, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s4, 24 @@ -903,46 +894,43 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_mul_i32 s0, s3, s8 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s9, s1, s0 -; GCN-NEXT: s_sub_i32 s10, 0, s9 +; GCN-NEXT: s_add_i32 s10, s1, s0 +; GCN-NEXT: s_sub_i32 s9, 0, s10 ; GCN-NEXT: s_mul_i32 s0, s2, s8 -; GCN-NEXT: s_sub_u32 s8, 24, s0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s11, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_subb_u32 s10, s10, s3 -; GCN-NEXT: s_sub_u32 s12, s8, s2 +; GCN-NEXT: s_sub_u32 s11, 24, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_or_b32 s8, s0, s1 +; GCN-NEXT: s_subb_u32 s12, s9, s3 +; GCN-NEXT: s_sub_u32 s13, s11, s2 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s14, s8, s9 +; GCN-NEXT: s_subb_u32 s14, s12, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s3 +; GCN-NEXT: s_cselect_b32 s15, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s13, s2 +; GCN-NEXT: s_cselect_b32 s16, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s14, s3 +; GCN-NEXT: s_cselect_b32 s15, s16, s15 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_subb_u32 s12, s12, s3 +; GCN-NEXT: s_sub_u32 s16, s13, s2 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_subb_u32 s8, s12, 0 +; GCN-NEXT: s_cmp_lg_u32 s15, 0 +; GCN-NEXT: s_cselect_b32 s9, s16, s13 +; GCN-NEXT: s_cselect_b32 s8, s8, s14 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_subb_u32 s13, s10, 0 -; GCN-NEXT: s_cmp_ge_u32 s13, s3 +; GCN-NEXT: s_subb_u32 s0, 0, s10 +; GCN-NEXT: s_cmp_ge_u32 s0, s3 ; GCN-NEXT: s_cselect_b32 s1, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s12, s2 -; GCN-NEXT: s_cselect_b32 s14, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s13, s3 -; GCN-NEXT: s_cselect_b32 s14, s14, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_subb_u32 s10, s10, s3 -; GCN-NEXT: s_sub_u32 s15, s12, s2 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_subb_u32 s0, s10, 0 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_cselect_b32 s1, s15, s12 -; GCN-NEXT: s_cselect_b32 s0, s0, s13 -; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_subb_u32 s9, 0, s9 -; GCN-NEXT: s_cmp_ge_u32 s9, s3 -; GCN-NEXT: s_cselect_b32 s10, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s8, s2 +; GCN-NEXT: s_cmp_ge_u32 s11, s2 ; GCN-NEXT: s_cselect_b32 s2, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s9, s3 -; GCN-NEXT: s_cselect_b32 s2, s2, s10 -; GCN-NEXT: s_cmp_lg_u32 s2, 0 -; GCN-NEXT: s_cselect_b32 s0, s0, s9 -; GCN-NEXT: s_cselect_b32 s1, s1, s8 +; GCN-NEXT: s_cmp_eq_u32 s0, s3 +; GCN-NEXT: s_cselect_b32 s1, s2, s1 +; GCN-NEXT: s_cmp_lg_u32 s1, 0 +; GCN-NEXT: s_cselect_b32 s0, s8, s0 +; GCN-NEXT: s_cselect_b32 s1, s9, s11 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -970,7 +958,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s10, s8, 1 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_or_b32 s6, s6, s7 -; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -1003,7 +990,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_or_b32 s16, s16, s17 -; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] @@ -1093,7 +1079,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s11, s8, 1 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_or_b32 s6, s6, s7 -; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -1123,7 +1108,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s12, s12, 1 ; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0 ; GCN-IR-NEXT: s_or_b32 s14, s14, s15 -; GCN-IR-NEXT: s_cmp_lg_u32 s14, 0 ; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll index d67a7b1..e8db647 100644 --- a/llvm/test/CodeGen/AMDGPU/usubo.ll +++ b/llvm/test/CodeGen/AMDGPU/usubo.ll @@ -18,7 +18,6 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: s_or_b32 s0, s0, s1 -; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_subb_u32 s3, s3, s9 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -35,10 +34,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_sub_u32 s2, s2, s4 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 ; VI-NEXT: s_subb_u32 s3, s3, s5 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -53,14 +50,12 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s6, s2, s6 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_subb_u32 s4, s3, s7 +; GFX9-NEXT: s_sub_u32 s4, s2, s6 +; GFX9-NEXT: s_subb_u32 s5, s3, s7 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm @@ -73,8 +68,6 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sub_u32 s2, s2, s6 -; GFX10-NEXT: s_cselect_b32 s4, -1, 0 -; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_subb_u32 s3, s3, s7 ; GFX10-NEXT: s_cselect_b32 s4, -1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 @@ -91,14 +84,12 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s2, s2, s4 -; GFX11-NEXT: s_cselect_b32 s4, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-NEXT: s_subb_u32 s3, s3, s5 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm @@ -443,7 +434,6 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_sub_u32 s4, s4, s6 ; SI-NEXT: s_cselect_b64 s[12:13], -1, 0 ; SI-NEXT: s_or_b32 s6, s12, s13 -; SI-NEXT: s_cmp_lg_u32 s6, 0 ; SI-NEXT: s_subb_u32 s5, s5, s7 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 @@ -464,16 +454,14 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: s_sub_u32 s2, s4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_sub_u32 s0, s4, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_subb_u32 s1, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 -; VI-NEXT: s_subb_u32 s0, s5, s7 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_mov_b32_e32 v5, s0 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -485,12 +473,10 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s2, s12, s14 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_subb_u32 s0, s13, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_sub_u32 s0, s12, s14 +; GFX9-NEXT: s_subb_u32 s1, s13, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -503,10 +489,8 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sub_u32 s0, s12, s14 -; GFX10-NEXT: s_cselect_b32 s1, -1, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: s_subb_u32 s1, s13, s15 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_cselect_b32 s0, -1, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -519,10 +503,8 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s4, s4, s6 -; GFX11-NEXT: s_cselect_b32 s6, -1, 0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_cmp_lg_u32 s6, 0 ; GFX11-NEXT: s_subb_u32 s5, s5, s7 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 75db387..28c6b40 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -774,44 +774,40 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_add_u32 s11, s12, s11 ; GFX1032-NEXT: s_addc_u32 s12, 0, s13 ; GFX1032-NEXT: s_add_u32 s8, s8, s11 -; GFX1032-NEXT: s_cselect_b32 s11, -1, 0 -; GFX1032-NEXT: s_mul_hi_u32 s13, s9, s8 -; GFX1032-NEXT: s_cmp_lg_u32 s11, 0 -; GFX1032-NEXT: s_mul_i32 s11, s9, s8 ; GFX1032-NEXT: s_addc_u32 s5, s5, s12 -; GFX1032-NEXT: s_mul_i32 s10, s10, s8 +; GFX1032-NEXT: s_mul_hi_u32 s11, s9, s8 +; GFX1032-NEXT: s_mul_i32 s12, s9, s8 ; GFX1032-NEXT: s_mul_i32 s9, s9, s5 -; GFX1032-NEXT: s_mul_hi_u32 s12, s8, s11 -; GFX1032-NEXT: s_add_i32 s9, s13, s9 -; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s11 +; GFX1032-NEXT: s_mul_i32 s10, s10, s8 +; GFX1032-NEXT: s_add_i32 s9, s11, s9 +; GFX1032-NEXT: s_mul_i32 s11, s5, s12 ; GFX1032-NEXT: s_add_i32 s9, s9, s10 -; GFX1032-NEXT: s_mul_i32 s10, s5, s11 +; GFX1032-NEXT: s_mul_hi_u32 s10, s8, s12 ; GFX1032-NEXT: s_mul_i32 s15, s8, s9 ; GFX1032-NEXT: s_mul_hi_u32 s14, s8, s9 -; GFX1032-NEXT: s_add_u32 s12, s12, s15 +; GFX1032-NEXT: s_add_u32 s10, s10, s15 +; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s12 ; GFX1032-NEXT: s_addc_u32 s14, 0, s14 -; GFX1032-NEXT: s_mul_hi_u32 s11, s5, s9 -; GFX1032-NEXT: s_add_u32 s10, s12, s10 +; GFX1032-NEXT: s_mul_hi_u32 s12, s5, s9 +; GFX1032-NEXT: s_add_u32 s10, s10, s11 ; GFX1032-NEXT: s_mul_i32 s9, s5, s9 ; GFX1032-NEXT: s_addc_u32 s10, s14, s13 -; GFX1032-NEXT: s_addc_u32 s11, s11, 0 +; GFX1032-NEXT: s_addc_u32 s11, s12, 0 ; GFX1032-NEXT: s_add_u32 s9, s10, s9 ; GFX1032-NEXT: s_addc_u32 s10, 0, s11 ; GFX1032-NEXT: s_add_u32 s8, s8, s9 -; GFX1032-NEXT: s_cselect_b32 s9, -1, 0 -; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s8 -; GFX1032-NEXT: s_cmp_lg_u32 s9, 0 -; GFX1032-NEXT: s_mul_hi_u32 s9, s3, s8 ; GFX1032-NEXT: s_addc_u32 s5, s5, s10 -; GFX1032-NEXT: s_mul_i32 s8, s3, s8 +; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s8 ; GFX1032-NEXT: s_mul_i32 s12, s2, s5 -; GFX1032-NEXT: s_mul_hi_u32 s10, s2, s5 -; GFX1032-NEXT: s_add_u32 s11, s11, s12 -; GFX1032-NEXT: s_addc_u32 s10, 0, s10 +; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s5 +; GFX1032-NEXT: s_mul_hi_u32 s10, s3, s8 +; GFX1032-NEXT: s_mul_i32 s8, s3, s8 +; GFX1032-NEXT: s_add_u32 s9, s9, s12 +; GFX1032-NEXT: s_addc_u32 s11, 0, s11 ; GFX1032-NEXT: s_mul_hi_u32 s13, s3, s5 -; GFX1032-NEXT: s_add_u32 s8, s11, s8 +; GFX1032-NEXT: s_add_u32 s8, s9, s8 ; GFX1032-NEXT: s_mul_i32 s5, s3, s5 -; GFX1032-NEXT: s_addc_u32 s8, s10, s9 +; GFX1032-NEXT: s_addc_u32 s8, s11, s10 ; GFX1032-NEXT: s_addc_u32 s9, s13, 0 ; GFX1032-NEXT: s_add_u32 s5, s8, s5 ; GFX1032-NEXT: s_addc_u32 s8, 0, s9 @@ -824,11 +820,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_sub_i32 s11, s3, s9 ; GFX1032-NEXT: s_sub_u32 s10, s2, s10 ; GFX1032-NEXT: s_cselect_b32 s12, -1, 0 -; GFX1032-NEXT: s_cmp_lg_u32 s12, 0 ; GFX1032-NEXT: s_subb_u32 s11, s11, s1 ; GFX1032-NEXT: s_sub_u32 s13, s10, s0 -; GFX1032-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1032-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1032-NEXT: s_subb_u32 s11, s11, 0 ; GFX1032-NEXT: s_cmp_ge_u32 s11, s1 ; GFX1032-NEXT: s_cselect_b32 s14, -1, 0 @@ -901,8 +894,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX1064-NEXT: s_sub_u32 s9, 0, s0 -; GFX1064-NEXT: s_subb_u32 s10, 0, s1 +; GFX1064-NEXT: s_sub_u32 s8, 0, s0 +; GFX1064-NEXT: s_subb_u32 s9, 0, s1 ; GFX1064-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX1064-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1064-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -911,109 +904,102 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX1064-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1064-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1064-NEXT: v_readfirstlane_b32 s8, v1 -; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 -; GFX1064-NEXT: s_mul_i32 s5, s9, s8 -; GFX1064-NEXT: s_mul_hi_u32 s12, s9, s4 -; GFX1064-NEXT: s_mul_i32 s11, s10, s4 -; GFX1064-NEXT: s_add_i32 s5, s12, s5 -; GFX1064-NEXT: s_mul_i32 s13, s9, s4 -; GFX1064-NEXT: s_add_i32 s5, s5, s11 -; GFX1064-NEXT: s_mul_hi_u32 s12, s4, s13 -; GFX1064-NEXT: s_mul_i32 s15, s4, s5 -; GFX1064-NEXT: s_mul_hi_u32 s14, s8, s13 -; GFX1064-NEXT: s_mul_i32 s11, s8, s13 -; GFX1064-NEXT: s_mul_hi_u32 s13, s4, s5 +; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s5, v0 +; GFX1064-NEXT: s_mul_i32 s10, s8, s4 +; GFX1064-NEXT: s_mul_hi_u32 s12, s8, s5 +; GFX1064-NEXT: s_mul_i32 s11, s9, s5 +; GFX1064-NEXT: s_add_i32 s10, s12, s10 +; GFX1064-NEXT: s_mul_i32 s13, s8, s5 +; GFX1064-NEXT: s_add_i32 s10, s10, s11 +; GFX1064-NEXT: s_mul_hi_u32 s12, s5, s13 +; GFX1064-NEXT: s_mul_i32 s15, s5, s10 +; GFX1064-NEXT: s_mul_hi_u32 s14, s4, s13 +; GFX1064-NEXT: s_mul_i32 s11, s4, s13 +; GFX1064-NEXT: s_mul_hi_u32 s13, s5, s10 ; GFX1064-NEXT: s_add_u32 s12, s12, s15 ; GFX1064-NEXT: s_addc_u32 s13, 0, s13 -; GFX1064-NEXT: s_mul_hi_u32 s16, s8, s5 +; GFX1064-NEXT: s_mul_hi_u32 s16, s4, s10 ; GFX1064-NEXT: s_add_u32 s11, s12, s11 -; GFX1064-NEXT: s_mul_i32 s5, s8, s5 +; GFX1064-NEXT: s_mul_i32 s10, s4, s10 ; GFX1064-NEXT: s_addc_u32 s11, s13, s14 ; GFX1064-NEXT: s_addc_u32 s12, s16, 0 -; GFX1064-NEXT: s_add_u32 s5, s11, s5 +; GFX1064-NEXT: s_add_u32 s10, s11, s10 ; GFX1064-NEXT: s_addc_u32 s11, 0, s12 -; GFX1064-NEXT: s_add_u32 s12, s4, s5 -; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX1064-NEXT: s_mul_hi_u32 s13, s9, s12 -; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX1064-NEXT: s_mul_i32 s4, s9, s12 -; GFX1064-NEXT: s_addc_u32 s8, s8, s11 -; GFX1064-NEXT: s_mul_i32 s10, s10, s12 -; GFX1064-NEXT: s_mul_i32 s9, s9, s8 -; GFX1064-NEXT: s_mul_hi_u32 s5, s12, s4 -; GFX1064-NEXT: s_add_i32 s9, s13, s9 -; GFX1064-NEXT: s_mul_hi_u32 s11, s8, s4 -; GFX1064-NEXT: s_add_i32 s9, s9, s10 -; GFX1064-NEXT: s_mul_i32 s4, s8, s4 -; GFX1064-NEXT: s_mul_i32 s14, s12, s9 -; GFX1064-NEXT: s_mul_hi_u32 s13, s12, s9 -; GFX1064-NEXT: s_add_u32 s5, s5, s14 +; GFX1064-NEXT: s_add_u32 s5, s5, s10 +; GFX1064-NEXT: s_addc_u32 s4, s4, s11 +; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s5 +; GFX1064-NEXT: s_mul_i32 s11, s8, s5 +; GFX1064-NEXT: s_mul_i32 s8, s8, s4 +; GFX1064-NEXT: s_mul_i32 s9, s9, s5 +; GFX1064-NEXT: s_add_i32 s8, s10, s8 +; GFX1064-NEXT: s_mul_i32 s10, s4, s11 +; GFX1064-NEXT: s_add_i32 s8, s8, s9 +; GFX1064-NEXT: s_mul_hi_u32 s9, s5, s11 +; GFX1064-NEXT: s_mul_i32 s14, s5, s8 +; GFX1064-NEXT: s_mul_hi_u32 s13, s5, s8 +; GFX1064-NEXT: s_add_u32 s9, s9, s14 +; GFX1064-NEXT: s_mul_hi_u32 s12, s4, s11 ; GFX1064-NEXT: s_addc_u32 s13, 0, s13 -; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s9 -; GFX1064-NEXT: s_add_u32 s4, s5, s4 -; GFX1064-NEXT: s_mul_i32 s9, s8, s9 -; GFX1064-NEXT: s_addc_u32 s4, s13, s11 -; GFX1064-NEXT: s_addc_u32 s5, s10, 0 -; GFX1064-NEXT: s_add_u32 s4, s4, s9 -; GFX1064-NEXT: s_addc_u32 s9, 0, s5 -; GFX1064-NEXT: s_add_u32 s10, s12, s4 -; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX1064-NEXT: s_mul_hi_u32 s11, s2, s10 -; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX1064-NEXT: s_mul_hi_u32 s4, s3, s10 -; GFX1064-NEXT: s_addc_u32 s5, s8, s9 -; GFX1064-NEXT: s_mul_i32 s8, s3, s10 -; GFX1064-NEXT: s_mul_i32 s10, s2, s5 -; GFX1064-NEXT: s_mul_hi_u32 s9, s2, s5 -; GFX1064-NEXT: s_add_u32 s10, s11, s10 -; GFX1064-NEXT: s_addc_u32 s9, 0, s9 -; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s5 -; GFX1064-NEXT: s_add_u32 s8, s10, s8 +; GFX1064-NEXT: s_mul_hi_u32 s11, s4, s8 +; GFX1064-NEXT: s_add_u32 s9, s9, s10 +; GFX1064-NEXT: s_mul_i32 s8, s4, s8 +; GFX1064-NEXT: s_addc_u32 s9, s13, s12 +; GFX1064-NEXT: s_addc_u32 s10, s11, 0 +; GFX1064-NEXT: s_add_u32 s8, s9, s8 +; GFX1064-NEXT: s_addc_u32 s9, 0, s10 +; GFX1064-NEXT: s_add_u32 s5, s5, s8 +; GFX1064-NEXT: s_addc_u32 s4, s4, s9 +; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s5 +; GFX1064-NEXT: s_mul_i32 s11, s2, s4 +; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s4 +; GFX1064-NEXT: s_mul_hi_u32 s9, s3, s5 ; GFX1064-NEXT: s_mul_i32 s5, s3, s5 -; GFX1064-NEXT: s_addc_u32 s4, s9, s4 +; GFX1064-NEXT: s_add_u32 s8, s8, s11 +; GFX1064-NEXT: s_addc_u32 s10, 0, s10 +; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s4 +; GFX1064-NEXT: s_add_u32 s5, s8, s5 +; GFX1064-NEXT: s_mul_i32 s4, s3, s4 +; GFX1064-NEXT: s_addc_u32 s5, s10, s9 ; GFX1064-NEXT: s_addc_u32 s8, s12, 0 -; GFX1064-NEXT: s_add_u32 s10, s4, s5 +; GFX1064-NEXT: s_add_u32 s10, s5, s4 ; GFX1064-NEXT: s_addc_u32 s11, 0, s8 ; GFX1064-NEXT: s_mul_hi_u32 s4, s0, s10 ; GFX1064-NEXT: s_mul_i32 s5, s0, s11 ; GFX1064-NEXT: s_mul_i32 s8, s1, s10 ; GFX1064-NEXT: s_add_i32 s4, s4, s5 -; GFX1064-NEXT: s_add_i32 s12, s4, s8 +; GFX1064-NEXT: s_add_i32 s8, s4, s8 ; GFX1064-NEXT: s_mul_i32 s4, s0, s10 -; GFX1064-NEXT: s_sub_i32 s8, s3, s12 -; GFX1064-NEXT: s_sub_u32 s13, s2, s4 +; GFX1064-NEXT: s_sub_i32 s9, s3, s8 +; GFX1064-NEXT: s_sub_u32 s12, s2, s4 ; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX1064-NEXT: s_subb_u32 s14, s8, s1 -; GFX1064-NEXT: s_sub_u32 s15, s13, s0 -; GFX1064-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX1064-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GFX1064-NEXT: s_subb_u32 s8, s14, 0 -; GFX1064-NEXT: s_cmp_ge_u32 s8, s1 -; GFX1064-NEXT: s_cselect_b32 s9, -1, 0 -; GFX1064-NEXT: s_cmp_ge_u32 s15, s0 +; GFX1064-NEXT: s_subb_u32 s9, s9, s1 +; GFX1064-NEXT: s_sub_u32 s13, s12, s0 +; GFX1064-NEXT: s_subb_u32 s9, s9, 0 +; GFX1064-NEXT: s_cmp_ge_u32 s9, s1 ; GFX1064-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1064-NEXT: s_cmp_eq_u32 s8, s1 -; GFX1064-NEXT: s_cselect_b32 s8, s14, s9 -; GFX1064-NEXT: s_add_u32 s9, s10, 1 +; GFX1064-NEXT: s_cmp_ge_u32 s13, s0 +; GFX1064-NEXT: s_cselect_b32 s13, -1, 0 +; GFX1064-NEXT: s_cmp_eq_u32 s9, s1 +; GFX1064-NEXT: s_cselect_b32 s9, s13, s14 +; GFX1064-NEXT: s_add_u32 s13, s10, 1 ; GFX1064-NEXT: s_addc_u32 s14, s11, 0 ; GFX1064-NEXT: s_add_u32 s15, s10, 2 ; GFX1064-NEXT: s_addc_u32 s16, s11, 0 -; GFX1064-NEXT: s_cmp_lg_u32 s8, 0 -; GFX1064-NEXT: s_cselect_b32 s15, s15, s9 +; GFX1064-NEXT: s_cmp_lg_u32 s9, 0 +; GFX1064-NEXT: s_cselect_b32 s13, s15, s13 ; GFX1064-NEXT: s_cselect_b32 s14, s16, s14 ; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX1064-NEXT: s_subb_u32 s3, s3, s12 +; GFX1064-NEXT: s_subb_u32 s3, s3, s8 ; GFX1064-NEXT: s_cmp_ge_u32 s3, s1 ; GFX1064-NEXT: s_cselect_b32 s4, -1, 0 -; GFX1064-NEXT: s_cmp_ge_u32 s13, s0 +; GFX1064-NEXT: s_cmp_ge_u32 s12, s0 ; GFX1064-NEXT: s_cselect_b32 s5, -1, 0 ; GFX1064-NEXT: s_cmp_eq_u32 s3, s1 ; GFX1064-NEXT: s_cselect_b32 s1, s5, s4 ; GFX1064-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1064-NEXT: s_cselect_b32 s5, s14, s11 -; GFX1064-NEXT: s_cselect_b32 s4, s15, s10 +; GFX1064-NEXT: s_cselect_b32 s4, s13, s10 ; GFX1064-NEXT: s_cbranch_execnz .LBB15_3 ; GFX1064-NEXT: .LBB15_2: ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll index 64d055b..4445383 100644 --- a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll +++ b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll @@ -271,7 +271,6 @@ define i1 @workgroup_nonzero() { ; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX8-NEXT: s_or_b32 s4, s12, s13 ; DAGISEL-GFX8-NEXT: s_or_b32 s4, s4, s14 -; DAGISEL-GFX8-NEXT: s_cmp_lg_u32 s4, 0 ; DAGISEL-GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 ; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31] @@ -281,7 +280,6 @@ define i1 @workgroup_nonzero() { ; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX942-NEXT: s_or_b32 s0, s12, s13 ; DAGISEL-GFX942-NEXT: s_or_b32 s0, s0, s14 -; DAGISEL-GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; DAGISEL-GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 ; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; DAGISEL-GFX942-NEXT: s_setpc_b64 s[30:31] @@ -299,8 +297,6 @@ define i1 @workgroup_nonzero() { ; DAGISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0 ; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe ; DAGISEL-GFX12-NEXT: s_or_b32 s0, s0, s1 -; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe -; DAGISEL-GFX12-NEXT: s_cmp_lg_u32 s0, 0 ; DAGISEL-GFX12-NEXT: s_cselect_b32 s0, -1, 0 ; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe ; DAGISEL-GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 @@ -311,7 +307,6 @@ define i1 @workgroup_nonzero() { ; GISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX8-NEXT: s_or_b32 s4, s12, s13 ; GISEL-GFX8-NEXT: s_or_b32 s4, s4, s14 -; GISEL-GFX8-NEXT: s_cmp_lg_u32 s4, 0 ; GISEL-GFX8-NEXT: s_cselect_b32 s4, 1, 0 ; GISEL-GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GISEL-GFX8-NEXT: s_setpc_b64 s[30:31] @@ -321,7 +316,6 @@ define i1 @workgroup_nonzero() { ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX942-NEXT: s_or_b32 s0, s12, s13 ; GISEL-GFX942-NEXT: s_or_b32 s0, s0, s14 -; GISEL-GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GISEL-GFX942-NEXT: s_cselect_b32 s0, 1, 0 ; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-GFX942-NEXT: s_setpc_b64 s[30:31] @@ -339,8 +333,6 @@ define i1 @workgroup_nonzero() { ; GISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0 ; GISEL-GFX12-NEXT: s_wait_alu 0xfffe ; GISEL-GFX12-NEXT: s_or_b32 s0, s0, s1 -; GISEL-GFX12-NEXT: s_wait_alu 0xfffe -; GISEL-GFX12-NEXT: s_cmp_lg_u32 s0, 0 ; GISEL-GFX12-NEXT: s_cselect_b32 s0, 1, 0 ; GISEL-GFX12-NEXT: s_wait_alu 0xfffe ; GISEL-GFX12-NEXT: v_mov_b32_e32 v0, s0 |