diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
35 files changed, 3221 insertions, 2319 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll index 7714c03..5171403 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll @@ -140,6 +140,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) { ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB9_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 @@ -344,6 +345,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) { ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 ; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1 ; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB17_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll index 7b81669..7b01f13 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll @@ -143,6 +143,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) { ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB9_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 @@ -347,6 +348,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) { ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0 ; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1 ; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB17_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 diff --git a/llvm/test/CodeGen/AMDGPU/abs_i32.ll b/llvm/test/CodeGen/AMDGPU/abs_i32.ll new file mode 100644 index 0000000..b53047f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/abs_i32.ll @@ -0,0 +1,92 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=r600 -mcpu=cypress < %s | FileCheck -check-prefixes=R600 %s + +define amdgpu_kernel void @abs_v1(ptr addrspace(1) %out, i32 %arg) { +; GFX9-LABEL: abs_v1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_abs_i32 s2, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; R600-LABEL: abs_v1: +; R600: ; %bb.0: +; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: MOV * T0.W, KC0[2].Z, +; R600-NEXT: SUB_INT * T1.W, 0.0, PV.W, +; R600-NEXT: MAX_INT T0.X, T0.W, PV.W, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %res = call i32 @llvm.abs.i32(i32 %arg, i1 false) + store i32 %res, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @abs_v2(ptr addrspace(1) %out, i32 %arg) { +; GFX9-LABEL: abs_v2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_abs_i32 s2, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; R600-LABEL: abs_v2: +; R600: ; %bb.0: +; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: SUB_INT * T0.W, 0.0, KC0[2].Z, +; R600-NEXT: MAX_INT T0.X, KC0[2].Z, PV.W, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %neg = sub i32 0, %arg + %cond = icmp sgt i32 %arg, %neg + %res = select i1 %cond, i32 %arg, i32 %neg + store i32 %res, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @abs_v3(ptr addrspace(1) %out, i32 %arg) { +; GFX9-LABEL: abs_v3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_abs_i32 s2, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; R600-LABEL: abs_v3: +; R600: ; %bb.0: +; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: SUB_INT * T0.W, 0.0, KC0[2].Z, +; R600-NEXT: MAX_INT T0.X, PV.W, KC0[2].Z, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %neg = sub i32 0, %arg + %cond = icmp sgt i32 %neg, %arg + %res = select i1 %cond, i32 %neg, i32 %arg + store i32 %res, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll index 8088c1b..b72eba8 100644 --- a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll +++ b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll @@ -180,7 +180,11 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B ; CHECK-LABEL: s_add64_32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 s0, s0, s2 +; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0 ; CHECK-NEXT: s_addc_u32 s1, s1, s3 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 ; CHECK-NEXT: s_addc_u32 s2, s4, 0 ; CHECK-NEXT: ; return to shader part epilog %sum64 = add i64 %val64A, %val64B @@ -195,10 +199,14 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) { ; CHECK-LABEL: s_uadd_v2i64: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s6, s2, s6 -; CHECK-NEXT: s_addc_u32 s7, s3, s7 +; CHECK-NEXT: s_add_u32 s10, s2, s6 +; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0 +; CHECK-NEXT: s_addc_u32 s8, s3, s7 ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 ; CHECK-NEXT: s_add_u32 s0, s0, s4 +; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0 ; CHECK-NEXT: s_addc_u32 s1, s1, s5 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 @@ -207,8 +215,8 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v7 ; CHECK-NEXT: v_readfirstlane_b32 s2, v6 -; CHECK-NEXT: v_mov_b32_e32 v4, s6 -; CHECK-NEXT: v_mov_b32_e32 v5, s7 +; CHECK-NEXT: v_mov_b32_e32 v4, s10 +; CHECK-NEXT: v_mov_b32_e32 v5, s8 ; CHECK-NEXT: s_mov_b32 s1, s0 ; CHECK-NEXT: s_mov_b32 s3, s2 ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] @@ -225,10 +233,14 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) { ; CHECK-LABEL: s_usub_v2i64: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_sub_u32 s6, s2, s6 -; CHECK-NEXT: s_subb_u32 s7, s3, s7 +; CHECK-NEXT: s_sub_u32 s10, s2, s6 +; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0 +; CHECK-NEXT: s_subb_u32 s8, s3, s7 ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 ; CHECK-NEXT: s_sub_u32 s0, s0, s4 +; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0 ; CHECK-NEXT: s_subb_u32 s1, s1, s5 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 @@ -237,8 +249,8 @@ define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v7 ; CHECK-NEXT: v_readfirstlane_b32 s2, v6 -; CHECK-NEXT: v_mov_b32_e32 v4, s6 -; CHECK-NEXT: v_mov_b32_e32 v5, s7 +; CHECK-NEXT: v_mov_b32_e32 v4, s10 +; CHECK-NEXT: v_mov_b32_e32 v5, s8 ; CHECK-NEXT: s_mov_b32 s1, s0 ; CHECK-NEXT: s_mov_b32 s3, s2 ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] @@ -256,6 +268,8 @@ define amdgpu_ps i64 @s_uadd_i64(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) ; CHECK-LABEL: s_uadd_i64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 s0, s0, s2 +; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0 ; CHECK-NEXT: s_addc_u32 s1, s1, s3 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 @@ -278,6 +292,8 @@ define amdgpu_ps i64 @s_uadd_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) { ; CHECK-LABEL: s_uadd_p1: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 s0, s0, 1 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 @@ -323,6 +339,8 @@ define amdgpu_ps i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) { ; CHECK-LABEL: s_usub_p1: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_sub_u32 s0, s0, 1 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 ; CHECK-NEXT: s_subb_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 @@ -345,6 +363,8 @@ define amdgpu_ps i64 @s_usub_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) { ; CHECK-LABEL: s_usub_n1: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_sub_u32 s0, s0, -1 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 ; CHECK-NEXT: s_subb_u32 s1, s1, -1 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index 51df8c3..948811e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -7821,9 +7821,10 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_addc_u32 s15, 0, s16 ; GFX6-NEXT: s_add_u32 s16, s0, s1 ; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 ; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_addc_u32 s14, s14, s15 ; GFX6-NEXT: s_mul_i32 s0, s12, s14 ; GFX6-NEXT: v_readfirstlane_b32 s1, v0 @@ -7854,6 +7855,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_add_u32 s15, s16, s0 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_addc_u32 s14, s14, s12 ; GFX6-NEXT: s_ashr_i32 s12, s7, 31 ; GFX6-NEXT: s_add_u32 s0, s6, s12 @@ -7879,50 +7881,52 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: v_readfirstlane_b32 s4, v0 ; GFX6-NEXT: s_addc_u32 s4, s4, 0 ; GFX6-NEXT: s_mul_i32 s14, s7, s14 -; GFX6-NEXT: s_add_u32 s16, s1, s14 -; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: s_add_u32 s14, s1, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 ; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 -; GFX6-NEXT: s_addc_u32 s17, 0, s4 +; GFX6-NEXT: s_addc_u32 s15, 0, s4 ; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: s_mul_i32 s4, s10, s17 +; GFX6-NEXT: s_mul_i32 s4, s10, s15 ; GFX6-NEXT: v_readfirstlane_b32 s5, v0 ; GFX6-NEXT: s_add_i32 s4, s5, s4 -; GFX6-NEXT: s_mul_i32 s5, s11, s16 -; GFX6-NEXT: s_add_i32 s18, s4, s5 -; GFX6-NEXT: s_sub_i32 s14, s7, s18 -; GFX6-NEXT: s_mul_i32 s4, s10, s16 +; GFX6-NEXT: s_mul_i32 s5, s11, s14 +; GFX6-NEXT: s_add_i32 s16, s4, s5 +; GFX6-NEXT: s_sub_i32 s17, s7, s16 +; GFX6-NEXT: s_mul_i32 s4, s10, s14 ; GFX6-NEXT: s_sub_u32 s6, s6, s4 ; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX6-NEXT: s_or_b32 s15, s4, s5 -; GFX6-NEXT: s_subb_u32 s19, s14, s11 -; GFX6-NEXT: s_sub_u32 s20, s6, s10 -; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_subb_u32 s14, s19, 0 -; GFX6-NEXT: s_cmp_ge_u32 s14, s11 -; GFX6-NEXT: s_cselect_b32 s15, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s20, s10 -; GFX6-NEXT: s_cselect_b32 s19, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s14, s11 -; GFX6-NEXT: s_cselect_b32 s14, s19, s15 -; GFX6-NEXT: s_add_u32 s15, s16, 1 -; GFX6-NEXT: s_addc_u32 s19, s17, 0 -; GFX6-NEXT: s_add_u32 s20, s16, 2 -; GFX6-NEXT: s_addc_u32 s21, s17, 0 -; GFX6-NEXT: s_cmp_lg_u32 s14, 0 -; GFX6-NEXT: s_cselect_b32 s14, s20, s15 -; GFX6-NEXT: s_cselect_b32 s15, s21, s19 +; GFX6-NEXT: s_or_b32 s18, s4, s5 +; GFX6-NEXT: s_cmp_lg_u32 s18, 0 +; GFX6-NEXT: s_subb_u32 s17, s17, s11 +; GFX6-NEXT: s_sub_u32 s19, s6, s10 +; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX6-NEXT: s_or_b32 s4, s4, s5 -; GFX6-NEXT: s_subb_u32 s4, s7, s18 +; GFX6-NEXT: s_cmp_lg_u32 s4, 0 +; GFX6-NEXT: s_subb_u32 s4, s17, 0 ; GFX6-NEXT: s_cmp_ge_u32 s4, s11 ; GFX6-NEXT: s_cselect_b32 s5, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s6, s10 -; GFX6-NEXT: s_cselect_b32 s6, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s19, s10 +; GFX6-NEXT: s_cselect_b32 s17, -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s4, s11 -; GFX6-NEXT: s_cselect_b32 s4, s6, s5 +; GFX6-NEXT: s_cselect_b32 s4, s17, s5 +; GFX6-NEXT: s_add_u32 s5, s14, 1 +; GFX6-NEXT: s_addc_u32 s17, s15, 0 +; GFX6-NEXT: s_add_u32 s19, s14, 2 +; GFX6-NEXT: s_addc_u32 s20, s15, 0 ; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_cselect_b32 s5, s15, s17 -; GFX6-NEXT: s_cselect_b32 s4, s14, s16 +; GFX6-NEXT: s_cselect_b32 s4, s19, s5 +; GFX6-NEXT: s_cselect_b32 s5, s20, s17 +; GFX6-NEXT: s_cmp_lg_u32 s18, 0 +; GFX6-NEXT: s_subb_u32 s7, s7, s16 +; GFX6-NEXT: s_cmp_ge_u32 s7, s11 +; GFX6-NEXT: s_cselect_b32 s16, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s6, s10 +; GFX6-NEXT: s_cselect_b32 s6, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s7, s11 +; GFX6-NEXT: s_cselect_b32 s6, s6, s16 +; GFX6-NEXT: s_cmp_lg_u32 s6, 0 +; GFX6-NEXT: s_cselect_b32 s5, s5, s15 +; GFX6-NEXT: s_cselect_b32 s4, s4, s14 ; GFX6-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9] ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: s_sub_u32 s4, s4, s6 @@ -7945,8 +7949,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_sub_u32 s4, 0, s8 -; GFX9-NEXT: s_subb_u32 s5, 0, s9 +; GFX9-NEXT: s_sub_u32 s10, 0, s8 +; GFX9-NEXT: s_subb_u32 s11, 0, s9 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -7956,52 +7960,56 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s10, v2 -; GFX9-NEXT: v_readfirstlane_b32 s11, v1 -; GFX9-NEXT: s_mul_i32 s12, s4, s10 -; GFX9-NEXT: s_mul_hi_u32 s14, s4, s11 -; GFX9-NEXT: s_mul_i32 s13, s5, s11 -; GFX9-NEXT: s_add_i32 s12, s14, s12 -; GFX9-NEXT: s_mul_i32 s15, s4, s11 -; GFX9-NEXT: s_add_i32 s12, s12, s13 -; GFX9-NEXT: s_mul_hi_u32 s14, s11, s15 -; GFX9-NEXT: s_mul_i32 s16, s11, s12 -; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12 +; GFX9-NEXT: v_readfirstlane_b32 s12, v2 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_mul_i32 s5, s10, s12 +; GFX9-NEXT: s_mul_hi_u32 s14, s10, s4 +; GFX9-NEXT: s_mul_i32 s13, s11, s4 +; GFX9-NEXT: s_add_i32 s5, s14, s5 +; GFX9-NEXT: s_mul_i32 s15, s10, s4 +; GFX9-NEXT: s_add_i32 s5, s5, s13 +; GFX9-NEXT: s_mul_hi_u32 s14, s4, s15 +; GFX9-NEXT: s_mul_i32 s16, s4, s5 +; GFX9-NEXT: s_mul_hi_u32 s13, s4, s5 ; GFX9-NEXT: s_add_u32 s14, s14, s16 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_mul_hi_u32 s17, s10, s15 -; GFX9-NEXT: s_mul_i32 s15, s10, s15 +; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15 +; GFX9-NEXT: s_mul_i32 s15, s12, s15 ; GFX9-NEXT: s_add_u32 s14, s14, s15 -; GFX9-NEXT: s_mul_hi_u32 s16, s10, s12 +; GFX9-NEXT: s_mul_hi_u32 s16, s12, s5 ; GFX9-NEXT: s_addc_u32 s13, s13, s17 ; GFX9-NEXT: s_addc_u32 s14, s16, 0 -; GFX9-NEXT: s_mul_i32 s12, s10, s12 -; GFX9-NEXT: s_add_u32 s12, s13, s12 +; GFX9-NEXT: s_mul_i32 s5, s12, s5 +; GFX9-NEXT: s_add_u32 s5, s13, s5 ; GFX9-NEXT: s_addc_u32 s13, 0, s14 -; GFX9-NEXT: s_add_u32 s11, s11, s12 -; GFX9-NEXT: s_addc_u32 s10, s10, s13 -; GFX9-NEXT: s_mul_i32 s12, s4, s10 -; GFX9-NEXT: s_mul_hi_u32 s13, s4, s11 -; GFX9-NEXT: s_add_i32 s12, s13, s12 -; GFX9-NEXT: s_mul_i32 s5, s5, s11 -; GFX9-NEXT: s_add_i32 s12, s12, s5 -; GFX9-NEXT: s_mul_i32 s4, s4, s11 -; GFX9-NEXT: s_mul_hi_u32 s13, s10, s4 -; GFX9-NEXT: s_mul_i32 s14, s10, s4 -; GFX9-NEXT: s_mul_i32 s16, s11, s12 -; GFX9-NEXT: s_mul_hi_u32 s4, s11, s4 -; GFX9-NEXT: s_mul_hi_u32 s15, s11, s12 -; GFX9-NEXT: s_add_u32 s4, s4, s16 +; GFX9-NEXT: s_add_u32 s14, s4, s5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_addc_u32 s12, s12, s13 +; GFX9-NEXT: s_mul_i32 s4, s10, s12 +; GFX9-NEXT: s_mul_hi_u32 s5, s10, s14 +; GFX9-NEXT: s_add_i32 s4, s5, s4 +; GFX9-NEXT: s_mul_i32 s11, s11, s14 +; GFX9-NEXT: s_add_i32 s4, s4, s11 +; GFX9-NEXT: s_mul_i32 s10, s10, s14 +; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10 +; GFX9-NEXT: s_mul_i32 s13, s12, s10 +; GFX9-NEXT: s_mul_i32 s16, s14, s4 +; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10 +; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4 +; GFX9-NEXT: s_add_u32 s10, s10, s16 ; GFX9-NEXT: s_addc_u32 s15, 0, s15 -; GFX9-NEXT: s_add_u32 s4, s4, s14 -; GFX9-NEXT: s_mul_hi_u32 s5, s10, s12 -; GFX9-NEXT: s_addc_u32 s4, s15, s13 +; GFX9-NEXT: s_add_u32 s10, s10, s13 +; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4 +; GFX9-NEXT: s_addc_u32 s10, s15, s11 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_mul_i32 s12, s10, s12 -; GFX9-NEXT: s_add_u32 s4, s4, s12 -; GFX9-NEXT: s_addc_u32 s5, 0, s5 -; GFX9-NEXT: s_add_u32 s11, s11, s4 -; GFX9-NEXT: s_addc_u32 s10, s10, s5 +; GFX9-NEXT: s_mul_i32 s4, s12, s4 +; GFX9-NEXT: s_add_u32 s4, s10, s4 +; GFX9-NEXT: s_addc_u32 s10, 0, s5 +; GFX9-NEXT: s_add_u32 s11, s14, s4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_addc_u32 s10, s12, s10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: s_add_u32 s2, s2, s4 @@ -8020,35 +8028,38 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_addc_u32 s11, s12, s15 ; GFX9-NEXT: s_addc_u32 s12, s14, 0 ; GFX9-NEXT: s_mul_i32 s10, s3, s10 -; GFX9-NEXT: s_add_u32 s13, s11, s10 -; GFX9-NEXT: s_addc_u32 s12, 0, s12 -; GFX9-NEXT: s_mul_i32 s10, s8, s12 -; GFX9-NEXT: s_mul_hi_u32 s11, s8, s13 +; GFX9-NEXT: s_add_u32 s14, s11, s10 +; GFX9-NEXT: s_addc_u32 s15, 0, s12 +; GFX9-NEXT: s_mul_i32 s10, s8, s15 +; GFX9-NEXT: s_mul_hi_u32 s11, s8, s14 ; GFX9-NEXT: s_add_i32 s10, s11, s10 -; GFX9-NEXT: s_mul_i32 s11, s9, s13 -; GFX9-NEXT: s_add_i32 s14, s10, s11 -; GFX9-NEXT: s_sub_i32 s15, s3, s14 -; GFX9-NEXT: s_mul_i32 s10, s8, s13 +; GFX9-NEXT: s_mul_i32 s11, s9, s14 +; GFX9-NEXT: s_add_i32 s16, s10, s11 +; GFX9-NEXT: s_sub_i32 s12, s3, s16 +; GFX9-NEXT: s_mul_i32 s10, s8, s14 ; GFX9-NEXT: s_sub_u32 s2, s2, s10 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_subb_u32 s15, s15, s9 -; GFX9-NEXT: s_sub_u32 s16, s2, s8 -; GFX9-NEXT: s_subb_u32 s15, s15, 0 -; GFX9-NEXT: s_cmp_ge_u32 s15, s9 +; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GFX9-NEXT: s_subb_u32 s17, s12, s9 +; GFX9-NEXT: s_sub_u32 s18, s2, s8 +; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GFX9-NEXT: s_subb_u32 s12, s17, 0 +; GFX9-NEXT: s_cmp_ge_u32 s12, s9 +; GFX9-NEXT: s_cselect_b32 s13, -1, 0 +; GFX9-NEXT: s_cmp_ge_u32 s18, s8 ; GFX9-NEXT: s_cselect_b32 s17, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s16, s8 -; GFX9-NEXT: s_cselect_b32 s16, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s15, s9 -; GFX9-NEXT: s_cselect_b32 s15, s16, s17 -; GFX9-NEXT: s_add_u32 s16, s13, 1 -; GFX9-NEXT: s_addc_u32 s17, s12, 0 -; GFX9-NEXT: s_add_u32 s18, s13, 2 -; GFX9-NEXT: s_addc_u32 s19, s12, 0 -; GFX9-NEXT: s_cmp_lg_u32 s15, 0 -; GFX9-NEXT: s_cselect_b32 s15, s18, s16 -; GFX9-NEXT: s_cselect_b32 s16, s19, s17 +; GFX9-NEXT: s_cmp_eq_u32 s12, s9 +; GFX9-NEXT: s_cselect_b32 s12, s17, s13 +; GFX9-NEXT: s_add_u32 s13, s14, 1 +; GFX9-NEXT: s_addc_u32 s17, s15, 0 +; GFX9-NEXT: s_add_u32 s18, s14, 2 +; GFX9-NEXT: s_addc_u32 s19, s15, 0 +; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_cselect_b32 s12, s18, s13 +; GFX9-NEXT: s_cselect_b32 s13, s19, s17 ; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s3, s3, s14 +; GFX9-NEXT: s_subb_u32 s3, s3, s16 ; GFX9-NEXT: s_cmp_ge_u32 s3, s9 ; GFX9-NEXT: s_cselect_b32 s10, -1, 0 ; GFX9-NEXT: s_cmp_ge_u32 s2, s8 @@ -8056,8 +8067,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_cmp_eq_u32 s3, s9 ; GFX9-NEXT: s_cselect_b32 s2, s2, s10 ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 -; GFX9-NEXT: s_cselect_b32 s3, s16, s12 -; GFX9-NEXT: s_cselect_b32 s2, s15, s13 +; GFX9-NEXT: s_cselect_b32 s3, s13, s15 +; GFX9-NEXT: s_cselect_b32 s2, s12, s14 ; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: s_sub_u32 s2, s2, s4 @@ -8317,9 +8328,10 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_addc_u32 s17, 0, s18 ; GFX6-NEXT: s_add_u32 s18, s12, s13 ; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0 ; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_addc_u32 s16, s16, s17 ; GFX6-NEXT: s_mul_i32 s12, s14, s16 ; GFX6-NEXT: v_readfirstlane_b32 s13, v0 @@ -8350,6 +8362,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_add_u32 s15, s18, s12 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_addc_u32 s14, s16, s14 ; GFX6-NEXT: s_ashr_i32 s12, s9, 31 ; GFX6-NEXT: s_add_u32 s8, s8, s12 @@ -8374,53 +8387,55 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 ; GFX6-NEXT: s_addc_u32 s16, s16, 0 ; GFX6-NEXT: s_mul_i32 s14, s9, s14 -; GFX6-NEXT: s_add_u32 s18, s15, s14 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: s_add_u32 s17, s15, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s17 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX6-NEXT: s_addc_u32 s19, 0, s16 -; GFX6-NEXT: s_mul_i32 s14, s6, s19 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: s_mul_i32 s14, s6, s16 ; GFX6-NEXT: v_readfirstlane_b32 s15, v0 ; GFX6-NEXT: s_add_i32 s14, s15, s14 -; GFX6-NEXT: s_mul_i32 s15, s7, s18 -; GFX6-NEXT: s_add_i32 s20, s14, s15 -; GFX6-NEXT: s_sub_i32 s16, s9, s20 -; GFX6-NEXT: s_mul_i32 s14, s6, s18 +; GFX6-NEXT: s_mul_i32 s15, s7, s17 +; GFX6-NEXT: s_add_i32 s18, s14, s15 +; GFX6-NEXT: s_sub_i32 s19, s9, s18 +; GFX6-NEXT: s_mul_i32 s14, s6, s17 ; GFX6-NEXT: s_sub_u32 s8, s8, s14 ; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s17, s14, s15 -; GFX6-NEXT: s_subb_u32 s21, s16, s7 -; GFX6-NEXT: s_sub_u32 s22, s8, s6 -; GFX6-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GFX6-NEXT: s_or_b32 s16, s16, s17 -; GFX6-NEXT: s_subb_u32 s16, s21, 0 -; GFX6-NEXT: s_cmp_ge_u32 s16, s7 -; GFX6-NEXT: s_cselect_b32 s17, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s22, s6 -; GFX6-NEXT: s_cselect_b32 s21, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s16, s7 -; GFX6-NEXT: s_cselect_b32 s16, s21, s17 -; GFX6-NEXT: s_add_u32 s17, s18, 1 -; GFX6-NEXT: s_addc_u32 s21, s19, 0 -; GFX6-NEXT: s_add_u32 s22, s18, 2 -; GFX6-NEXT: s_addc_u32 s23, s19, 0 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_cselect_b32 s16, s22, s17 -; GFX6-NEXT: s_cselect_b32 s17, s23, s21 +; GFX6-NEXT: s_or_b32 s20, s14, s15 +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_subb_u32 s19, s19, s7 +; GFX6-NEXT: s_sub_u32 s21, s8, s6 +; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 ; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_subb_u32 s9, s9, s20 +; GFX6-NEXT: s_cmp_lg_u32 s14, 0 +; GFX6-NEXT: s_subb_u32 s14, s19, 0 +; GFX6-NEXT: s_cmp_ge_u32 s14, s7 +; GFX6-NEXT: s_cselect_b32 s15, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s21, s6 +; GFX6-NEXT: s_cselect_b32 s19, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s14, s7 +; GFX6-NEXT: s_cselect_b32 s14, s19, s15 +; GFX6-NEXT: s_add_u32 s15, s17, 1 +; GFX6-NEXT: s_addc_u32 s19, s16, 0 +; GFX6-NEXT: s_add_u32 s21, s17, 2 +; GFX6-NEXT: s_addc_u32 s22, s16, 0 +; GFX6-NEXT: s_cmp_lg_u32 s14, 0 +; GFX6-NEXT: s_cselect_b32 s14, s21, s15 +; GFX6-NEXT: s_cselect_b32 s15, s22, s19 +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_subb_u32 s9, s9, s18 ; GFX6-NEXT: s_cmp_ge_u32 s9, s7 -; GFX6-NEXT: s_cselect_b32 s14, -1, 0 +; GFX6-NEXT: s_cselect_b32 s18, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s8, s6 ; GFX6-NEXT: s_cselect_b32 s6, -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s9, s7 -; GFX6-NEXT: s_cselect_b32 s6, s6, s14 +; GFX6-NEXT: s_cselect_b32 s6, s6, s18 ; GFX6-NEXT: s_cmp_lg_u32 s6, 0 -; GFX6-NEXT: s_cselect_b32 s7, s17, s19 -; GFX6-NEXT: s_cselect_b32 s6, s16, s18 +; GFX6-NEXT: s_cselect_b32 s7, s15, s16 +; GFX6-NEXT: s_cselect_b32 s6, s14, s17 ; GFX6-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3] ; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] -; GFX6-NEXT: s_sub_u32 s16, s6, s2 -; GFX6-NEXT: s_subb_u32 s17, s7, s3 +; GFX6-NEXT: s_sub_u32 s14, s6, s2 +; GFX6-NEXT: s_subb_u32 s15, s7, s3 ; GFX6-NEXT: s_ashr_i32 s6, s1, 31 ; GFX6-NEXT: s_add_u32 s0, s0, s6 ; GFX6-NEXT: s_mov_b32 s7, s6 @@ -8439,39 +8454,40 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: v_readfirstlane_b32 s16, v1 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s1, s12, s14 +; GFX6-NEXT: s_mul_i32 s1, s12, s16 ; GFX6-NEXT: v_readfirstlane_b32 s3, v2 ; GFX6-NEXT: s_mul_i32 s0, s13, s2 ; GFX6-NEXT: s_add_i32 s1, s3, s1 ; GFX6-NEXT: s_add_i32 s3, s1, s0 -; GFX6-NEXT: s_mul_i32 s15, s12, s2 +; GFX6-NEXT: s_mul_i32 s17, s12, s2 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s17 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mul_i32 s4, s2, s3 ; GFX6-NEXT: v_readfirstlane_b32 s5, v2 ; GFX6-NEXT: v_readfirstlane_b32 s18, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, s17 ; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3 ; GFX6-NEXT: s_add_u32 s4, s18, s4 ; GFX6-NEXT: s_addc_u32 s5, 0, s5 -; GFX6-NEXT: s_mul_i32 s15, s14, s15 +; GFX6-NEXT: s_mul_i32 s17, s16, s17 ; GFX6-NEXT: v_readfirstlane_b32 s18, v0 -; GFX6-NEXT: s_add_u32 s4, s4, s15 +; GFX6-NEXT: s_add_u32 s4, s4, s17 ; GFX6-NEXT: s_addc_u32 s4, s5, s18 ; GFX6-NEXT: v_readfirstlane_b32 s5, v1 ; GFX6-NEXT: s_addc_u32 s5, s5, 0 -; GFX6-NEXT: s_mul_i32 s3, s14, s3 +; GFX6-NEXT: s_mul_i32 s3, s16, s3 ; GFX6-NEXT: s_add_u32 s3, s4, s3 ; GFX6-NEXT: s_addc_u32 s4, 0, s5 ; GFX6-NEXT: s_add_u32 s5, s2, s3 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_addc_u32 s4, s14, s4 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: s_addc_u32 s4, s16, s4 ; GFX6-NEXT: s_mul_i32 s2, s12, s4 ; GFX6-NEXT: v_readfirstlane_b32 s3, v0 ; GFX6-NEXT: s_add_i32 s2, s3, s2 @@ -8485,14 +8501,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: s_mul_i32 s13, s5, s2 -; GFX6-NEXT: v_readfirstlane_b32 s15, v2 -; GFX6-NEXT: s_add_u32 s13, s15, s13 -; GFX6-NEXT: v_readfirstlane_b32 s14, v0 +; GFX6-NEXT: v_readfirstlane_b32 s17, v2 +; GFX6-NEXT: s_add_u32 s13, s17, s13 +; GFX6-NEXT: v_readfirstlane_b32 s16, v0 ; GFX6-NEXT: s_mul_i32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s14, 0, s14 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 ; GFX6-NEXT: v_readfirstlane_b32 s12, v3 ; GFX6-NEXT: s_add_u32 s3, s13, s3 -; GFX6-NEXT: s_addc_u32 s3, s14, s12 +; GFX6-NEXT: s_addc_u32 s3, s16, s12 ; GFX6-NEXT: v_readfirstlane_b32 s12, v1 ; GFX6-NEXT: s_addc_u32 s12, s12, 0 ; GFX6-NEXT: s_mul_i32 s2, s4, s2 @@ -8501,6 +8517,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_add_u32 s13, s5, s2 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 ; GFX6-NEXT: s_addc_u32 s12, s4, s12 ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 ; GFX6-NEXT: s_add_u32 s2, s10, s4 @@ -8512,70 +8529,72 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mov_b32_e32 v2, s13 ; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2 ; GFX6-NEXT: s_mul_i32 s2, s10, s12 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: v_readfirstlane_b32 s16, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s11, v2 -; GFX6-NEXT: v_readfirstlane_b32 s15, v3 +; GFX6-NEXT: v_readfirstlane_b32 s17, v3 ; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX6-NEXT: s_add_u32 s2, s15, s2 -; GFX6-NEXT: s_addc_u32 s14, 0, s14 +; GFX6-NEXT: s_add_u32 s2, s17, s2 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 ; GFX6-NEXT: s_mul_i32 s13, s11, s13 -; GFX6-NEXT: v_readfirstlane_b32 s15, v1 +; GFX6-NEXT: v_readfirstlane_b32 s17, v1 ; GFX6-NEXT: s_add_u32 s2, s2, s13 -; GFX6-NEXT: s_addc_u32 s2, s14, s15 +; GFX6-NEXT: s_addc_u32 s2, s16, s17 ; GFX6-NEXT: v_readfirstlane_b32 s13, v0 ; GFX6-NEXT: s_addc_u32 s13, s13, 0 ; GFX6-NEXT: s_mul_i32 s12, s11, s12 -; GFX6-NEXT: s_add_u32 s18, s2, s12 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: s_add_u32 s16, s2, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: s_addc_u32 s19, 0, s13 -; GFX6-NEXT: s_mul_i32 s12, s8, s19 +; GFX6-NEXT: s_addc_u32 s17, 0, s13 +; GFX6-NEXT: s_mul_i32 s12, s8, s17 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_readfirstlane_b32 s13, v0 ; GFX6-NEXT: s_add_i32 s12, s13, s12 -; GFX6-NEXT: s_mul_i32 s13, s9, s18 -; GFX6-NEXT: s_add_i32 s20, s12, s13 -; GFX6-NEXT: s_sub_i32 s14, s11, s20 -; GFX6-NEXT: s_mul_i32 s12, s8, s18 +; GFX6-NEXT: s_mul_i32 s13, s9, s16 +; GFX6-NEXT: s_add_i32 s18, s12, s13 +; GFX6-NEXT: s_sub_i32 s19, s11, s18 +; GFX6-NEXT: s_mul_i32 s12, s8, s16 ; GFX6-NEXT: s_sub_u32 s10, s10, s12 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s15, s12, s13 -; GFX6-NEXT: s_subb_u32 s21, s14, s9 -; GFX6-NEXT: s_sub_u32 s22, s10, s8 -; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_subb_u32 s14, s21, 0 -; GFX6-NEXT: s_cmp_ge_u32 s14, s9 -; GFX6-NEXT: s_cselect_b32 s15, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s22, s8 -; GFX6-NEXT: s_cselect_b32 s21, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s14, s9 -; GFX6-NEXT: s_cselect_b32 s14, s21, s15 -; GFX6-NEXT: s_add_u32 s15, s18, 1 -; GFX6-NEXT: s_addc_u32 s21, s19, 0 -; GFX6-NEXT: s_add_u32 s22, s18, 2 -; GFX6-NEXT: s_addc_u32 s23, s19, 0 -; GFX6-NEXT: s_cmp_lg_u32 s14, 0 -; GFX6-NEXT: s_cselect_b32 s14, s22, s15 -; GFX6-NEXT: s_cselect_b32 s15, s23, s21 +; GFX6-NEXT: s_or_b32 s20, s12, s13 +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_subb_u32 s19, s19, s9 +; GFX6-NEXT: s_sub_u32 s21, s10, s8 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s11, s11, s20 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_subb_u32 s12, s19, 0 +; GFX6-NEXT: s_cmp_ge_u32 s12, s9 +; GFX6-NEXT: s_cselect_b32 s13, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s21, s8 +; GFX6-NEXT: s_cselect_b32 s19, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s12, s9 +; GFX6-NEXT: s_cselect_b32 s12, s19, s13 +; GFX6-NEXT: s_add_u32 s13, s16, 1 +; GFX6-NEXT: s_addc_u32 s19, s17, 0 +; GFX6-NEXT: s_add_u32 s21, s16, 2 +; GFX6-NEXT: s_addc_u32 s22, s17, 0 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_cselect_b32 s12, s21, s13 +; GFX6-NEXT: s_cselect_b32 s13, s22, s19 +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_subb_u32 s11, s11, s18 ; GFX6-NEXT: s_cmp_ge_u32 s11, s9 -; GFX6-NEXT: s_cselect_b32 s12, -1, 0 +; GFX6-NEXT: s_cselect_b32 s18, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s10, s8 ; GFX6-NEXT: s_cselect_b32 s8, -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s11, s9 -; GFX6-NEXT: s_cselect_b32 s8, s8, s12 +; GFX6-NEXT: s_cselect_b32 s8, s8, s18 ; GFX6-NEXT: s_cmp_lg_u32 s8, 0 -; GFX6-NEXT: s_cselect_b32 s9, s15, s19 -; GFX6-NEXT: s_cselect_b32 s8, s14, s18 +; GFX6-NEXT: s_cselect_b32 s9, s13, s17 +; GFX6-NEXT: s_cselect_b32 s8, s12, s16 ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5] ; GFX6-NEXT: s_sub_u32 s4, s6, s4 ; GFX6-NEXT: s_subb_u32 s5, s7, s5 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: v_mov_b32_e32 v1, s17 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: v_mov_b32_e32 v1, s15 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8595,8 +8614,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_u32 s12, 0, s6 -; GFX9-NEXT: s_subb_u32 s13, 0, s7 +; GFX9-NEXT: s_sub_u32 s14, 0, s6 +; GFX9-NEXT: s_subb_u32 s15, 0, s7 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -8605,52 +8624,56 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s14, v1 -; GFX9-NEXT: v_readfirstlane_b32 s15, v0 -; GFX9-NEXT: s_mul_i32 s16, s12, s14 -; GFX9-NEXT: s_mul_hi_u32 s18, s12, s15 -; GFX9-NEXT: s_mul_i32 s17, s13, s15 -; GFX9-NEXT: s_add_i32 s16, s18, s16 -; GFX9-NEXT: s_mul_i32 s19, s12, s15 -; GFX9-NEXT: s_add_i32 s16, s16, s17 -; GFX9-NEXT: s_mul_hi_u32 s18, s15, s19 -; GFX9-NEXT: s_mul_i32 s20, s15, s16 -; GFX9-NEXT: s_mul_hi_u32 s17, s15, s16 +; GFX9-NEXT: v_readfirstlane_b32 s16, v1 +; GFX9-NEXT: v_readfirstlane_b32 s12, v0 +; GFX9-NEXT: s_mul_i32 s13, s14, s16 +; GFX9-NEXT: s_mul_hi_u32 s18, s14, s12 +; GFX9-NEXT: s_mul_i32 s17, s15, s12 +; GFX9-NEXT: s_add_i32 s13, s18, s13 +; GFX9-NEXT: s_mul_i32 s19, s14, s12 +; GFX9-NEXT: s_add_i32 s13, s13, s17 +; GFX9-NEXT: s_mul_hi_u32 s18, s12, s19 +; GFX9-NEXT: s_mul_i32 s20, s12, s13 +; GFX9-NEXT: s_mul_hi_u32 s17, s12, s13 ; GFX9-NEXT: s_add_u32 s18, s18, s20 ; GFX9-NEXT: s_addc_u32 s17, 0, s17 -; GFX9-NEXT: s_mul_hi_u32 s20, s14, s19 -; GFX9-NEXT: s_mul_i32 s19, s14, s19 +; GFX9-NEXT: s_mul_hi_u32 s20, s16, s19 +; GFX9-NEXT: s_mul_i32 s19, s16, s19 ; GFX9-NEXT: s_add_u32 s18, s18, s19 -; GFX9-NEXT: s_mul_hi_u32 s21, s14, s16 +; GFX9-NEXT: s_mul_hi_u32 s21, s16, s13 ; GFX9-NEXT: s_addc_u32 s17, s17, s20 ; GFX9-NEXT: s_addc_u32 s18, s21, 0 -; GFX9-NEXT: s_mul_i32 s16, s14, s16 -; GFX9-NEXT: s_add_u32 s16, s17, s16 +; GFX9-NEXT: s_mul_i32 s13, s16, s13 +; GFX9-NEXT: s_add_u32 s13, s17, s13 ; GFX9-NEXT: s_addc_u32 s17, 0, s18 -; GFX9-NEXT: s_add_u32 s15, s15, s16 -; GFX9-NEXT: s_addc_u32 s14, s14, s17 -; GFX9-NEXT: s_mul_i32 s16, s12, s14 -; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15 -; GFX9-NEXT: s_add_i32 s16, s17, s16 -; GFX9-NEXT: s_mul_i32 s13, s13, s15 -; GFX9-NEXT: s_add_i32 s16, s16, s13 -; GFX9-NEXT: s_mul_i32 s12, s12, s15 -; GFX9-NEXT: s_mul_hi_u32 s17, s14, s12 -; GFX9-NEXT: s_mul_i32 s18, s14, s12 -; GFX9-NEXT: s_mul_i32 s20, s15, s16 -; GFX9-NEXT: s_mul_hi_u32 s12, s15, s12 -; GFX9-NEXT: s_mul_hi_u32 s19, s15, s16 -; GFX9-NEXT: s_add_u32 s12, s12, s20 +; GFX9-NEXT: s_add_u32 s18, s12, s13 +; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GFX9-NEXT: s_addc_u32 s16, s16, s17 +; GFX9-NEXT: s_mul_i32 s12, s14, s16 +; GFX9-NEXT: s_mul_hi_u32 s13, s14, s18 +; GFX9-NEXT: s_add_i32 s12, s13, s12 +; GFX9-NEXT: s_mul_i32 s15, s15, s18 +; GFX9-NEXT: s_add_i32 s12, s12, s15 +; GFX9-NEXT: s_mul_i32 s14, s14, s18 +; GFX9-NEXT: s_mul_hi_u32 s15, s16, s14 +; GFX9-NEXT: s_mul_i32 s17, s16, s14 +; GFX9-NEXT: s_mul_i32 s20, s18, s12 +; GFX9-NEXT: s_mul_hi_u32 s14, s18, s14 +; GFX9-NEXT: s_mul_hi_u32 s19, s18, s12 +; GFX9-NEXT: s_add_u32 s14, s14, s20 ; GFX9-NEXT: s_addc_u32 s19, 0, s19 -; GFX9-NEXT: s_add_u32 s12, s12, s18 -; GFX9-NEXT: s_mul_hi_u32 s13, s14, s16 -; GFX9-NEXT: s_addc_u32 s12, s19, s17 +; GFX9-NEXT: s_add_u32 s14, s14, s17 +; GFX9-NEXT: s_mul_hi_u32 s13, s16, s12 +; GFX9-NEXT: s_addc_u32 s14, s19, s15 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NEXT: s_mul_i32 s16, s14, s16 -; GFX9-NEXT: s_add_u32 s12, s12, s16 -; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_add_u32 s15, s15, s12 -; GFX9-NEXT: s_addc_u32 s14, s14, s13 +; GFX9-NEXT: s_mul_i32 s12, s16, s12 +; GFX9-NEXT: s_add_u32 s12, s14, s12 +; GFX9-NEXT: s_addc_u32 s14, 0, s13 +; GFX9-NEXT: s_add_u32 s15, s18, s12 +; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GFX9-NEXT: s_addc_u32 s14, s16, s14 ; GFX9-NEXT: s_ashr_i32 s12, s9, 31 ; GFX9-NEXT: s_add_u32 s8, s8, s12 ; GFX9-NEXT: s_mov_b32 s13, s12 @@ -8668,35 +8691,38 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_addc_u32 s15, s16, s19 ; GFX9-NEXT: s_addc_u32 s16, s18, 0 ; GFX9-NEXT: s_mul_i32 s14, s9, s14 -; GFX9-NEXT: s_add_u32 s17, s15, s14 -; GFX9-NEXT: s_addc_u32 s16, 0, s16 -; GFX9-NEXT: s_mul_i32 s14, s6, s16 -; GFX9-NEXT: s_mul_hi_u32 s15, s6, s17 +; GFX9-NEXT: s_add_u32 s18, s15, s14 +; GFX9-NEXT: s_addc_u32 s19, 0, s16 +; GFX9-NEXT: s_mul_i32 s14, s6, s19 +; GFX9-NEXT: s_mul_hi_u32 s15, s6, s18 ; GFX9-NEXT: s_add_i32 s14, s15, s14 -; GFX9-NEXT: s_mul_i32 s15, s7, s17 -; GFX9-NEXT: s_add_i32 s18, s14, s15 -; GFX9-NEXT: s_sub_i32 s19, s9, s18 -; GFX9-NEXT: s_mul_i32 s14, s6, s17 +; GFX9-NEXT: s_mul_i32 s15, s7, s18 +; GFX9-NEXT: s_add_i32 s20, s14, s15 +; GFX9-NEXT: s_sub_i32 s16, s9, s20 +; GFX9-NEXT: s_mul_i32 s14, s6, s18 ; GFX9-NEXT: s_sub_u32 s8, s8, s14 ; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX9-NEXT: s_subb_u32 s19, s19, s7 -; GFX9-NEXT: s_sub_u32 s20, s8, s6 -; GFX9-NEXT: s_subb_u32 s19, s19, 0 -; GFX9-NEXT: s_cmp_ge_u32 s19, s7 +; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 +; GFX9-NEXT: s_subb_u32 s21, s16, s7 +; GFX9-NEXT: s_sub_u32 s22, s8, s6 +; GFX9-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GFX9-NEXT: s_subb_u32 s16, s21, 0 +; GFX9-NEXT: s_cmp_ge_u32 s16, s7 +; GFX9-NEXT: s_cselect_b32 s17, -1, 0 +; GFX9-NEXT: s_cmp_ge_u32 s22, s6 ; GFX9-NEXT: s_cselect_b32 s21, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s20, s6 -; GFX9-NEXT: s_cselect_b32 s20, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s19, s7 -; GFX9-NEXT: s_cselect_b32 s19, s20, s21 -; GFX9-NEXT: s_add_u32 s20, s17, 1 -; GFX9-NEXT: s_addc_u32 s21, s16, 0 -; GFX9-NEXT: s_add_u32 s22, s17, 2 -; GFX9-NEXT: s_addc_u32 s23, s16, 0 -; GFX9-NEXT: s_cmp_lg_u32 s19, 0 -; GFX9-NEXT: s_cselect_b32 s19, s22, s20 -; GFX9-NEXT: s_cselect_b32 s20, s23, s21 +; GFX9-NEXT: s_cmp_eq_u32 s16, s7 +; GFX9-NEXT: s_cselect_b32 s16, s21, s17 +; GFX9-NEXT: s_add_u32 s17, s18, 1 +; GFX9-NEXT: s_addc_u32 s21, s19, 0 +; GFX9-NEXT: s_add_u32 s22, s18, 2 +; GFX9-NEXT: s_addc_u32 s23, s19, 0 +; GFX9-NEXT: s_cmp_lg_u32 s16, 0 +; GFX9-NEXT: s_cselect_b32 s16, s22, s17 +; GFX9-NEXT: s_cselect_b32 s17, s23, s21 ; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GFX9-NEXT: s_subb_u32 s9, s9, s18 +; GFX9-NEXT: s_subb_u32 s9, s9, s20 ; GFX9-NEXT: s_cmp_ge_u32 s9, s7 ; GFX9-NEXT: s_cselect_b32 s14, -1, 0 ; GFX9-NEXT: s_cmp_ge_u32 s8, s6 @@ -8704,12 +8730,12 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cmp_eq_u32 s9, s7 ; GFX9-NEXT: s_cselect_b32 s6, s6, s14 ; GFX9-NEXT: s_cmp_lg_u32 s6, 0 -; GFX9-NEXT: s_cselect_b32 s7, s20, s16 -; GFX9-NEXT: s_cselect_b32 s6, s19, s17 +; GFX9-NEXT: s_cselect_b32 s7, s17, s19 +; GFX9-NEXT: s_cselect_b32 s6, s16, s18 ; GFX9-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3] ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] -; GFX9-NEXT: s_sub_u32 s12, s6, s2 -; GFX9-NEXT: s_subb_u32 s13, s7, s3 +; GFX9-NEXT: s_sub_u32 s14, s6, s2 +; GFX9-NEXT: s_subb_u32 s15, s7, s3 ; GFX9-NEXT: s_ashr_i32 s2, s1, 31 ; GFX9-NEXT: s_add_u32 s0, s0, s2 ; GFX9-NEXT: s_mov_b32 s3, s2 @@ -8718,8 +8744,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_sub_u32 s4, 0, s6 -; GFX9-NEXT: s_subb_u32 s5, 0, s7 +; GFX9-NEXT: s_sub_u32 s8, 0, s6 +; GFX9-NEXT: s_subb_u32 s9, 0, s7 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -8729,98 +8755,105 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_readfirstlane_b32 s8, v1 -; GFX9-NEXT: v_readfirstlane_b32 s15, v2 -; GFX9-NEXT: s_mul_hi_u32 s14, s4, s8 -; GFX9-NEXT: s_mul_i32 s16, s4, s15 -; GFX9-NEXT: s_mul_i32 s9, s5, s8 -; GFX9-NEXT: s_add_i32 s14, s14, s16 -; GFX9-NEXT: s_add_i32 s14, s14, s9 -; GFX9-NEXT: s_mul_i32 s17, s4, s8 -; GFX9-NEXT: s_mul_i32 s16, s8, s14 -; GFX9-NEXT: s_mul_hi_u32 s18, s8, s17 -; GFX9-NEXT: s_mul_hi_u32 s9, s8, s14 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: v_readfirstlane_b32 s13, v2 +; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4 +; GFX9-NEXT: s_mul_i32 s16, s8, s13 +; GFX9-NEXT: s_mul_i32 s5, s9, s4 +; GFX9-NEXT: s_add_i32 s12, s12, s16 +; GFX9-NEXT: s_add_i32 s12, s12, s5 +; GFX9-NEXT: s_mul_i32 s17, s8, s4 +; GFX9-NEXT: s_mul_i32 s16, s4, s12 +; GFX9-NEXT: s_mul_hi_u32 s18, s4, s17 +; GFX9-NEXT: s_mul_hi_u32 s5, s4, s12 ; GFX9-NEXT: s_add_u32 s16, s18, s16 -; GFX9-NEXT: s_addc_u32 s9, 0, s9 -; GFX9-NEXT: s_mul_hi_u32 s19, s15, s17 -; GFX9-NEXT: s_mul_i32 s17, s15, s17 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_mul_hi_u32 s19, s13, s17 +; GFX9-NEXT: s_mul_i32 s17, s13, s17 ; GFX9-NEXT: s_add_u32 s16, s16, s17 -; GFX9-NEXT: s_mul_hi_u32 s18, s15, s14 -; GFX9-NEXT: s_addc_u32 s9, s9, s19 +; GFX9-NEXT: s_mul_hi_u32 s18, s13, s12 +; GFX9-NEXT: s_addc_u32 s5, s5, s19 ; GFX9-NEXT: s_addc_u32 s16, s18, 0 -; GFX9-NEXT: s_mul_i32 s14, s15, s14 -; GFX9-NEXT: s_add_u32 s9, s9, s14 -; GFX9-NEXT: s_addc_u32 s14, 0, s16 -; GFX9-NEXT: s_add_u32 s8, s8, s9 -; GFX9-NEXT: s_addc_u32 s9, s15, s14 -; GFX9-NEXT: s_mul_i32 s14, s4, s9 -; GFX9-NEXT: s_mul_hi_u32 s15, s4, s8 -; GFX9-NEXT: s_add_i32 s14, s15, s14 -; GFX9-NEXT: s_mul_i32 s5, s5, s8 -; GFX9-NEXT: s_add_i32 s14, s14, s5 -; GFX9-NEXT: s_mul_i32 s4, s4, s8 -; GFX9-NEXT: s_mul_hi_u32 s15, s9, s4 -; GFX9-NEXT: s_mul_i32 s16, s9, s4 -; GFX9-NEXT: s_mul_i32 s18, s8, s14 -; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 -; GFX9-NEXT: s_mul_hi_u32 s17, s8, s14 -; GFX9-NEXT: s_add_u32 s4, s4, s18 +; GFX9-NEXT: s_mul_i32 s12, s13, s12 +; GFX9-NEXT: s_add_u32 s5, s5, s12 +; GFX9-NEXT: s_addc_u32 s12, 0, s16 +; GFX9-NEXT: s_add_u32 s16, s4, s5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_addc_u32 s12, s13, s12 +; GFX9-NEXT: s_mul_i32 s4, s8, s12 +; GFX9-NEXT: s_mul_hi_u32 s5, s8, s16 +; GFX9-NEXT: s_add_i32 s4, s5, s4 +; GFX9-NEXT: s_mul_i32 s9, s9, s16 +; GFX9-NEXT: s_add_i32 s4, s4, s9 +; GFX9-NEXT: s_mul_i32 s8, s8, s16 +; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8 +; GFX9-NEXT: s_mul_i32 s13, s12, s8 +; GFX9-NEXT: s_mul_i32 s18, s16, s4 +; GFX9-NEXT: s_mul_hi_u32 s8, s16, s8 +; GFX9-NEXT: s_mul_hi_u32 s17, s16, s4 +; GFX9-NEXT: s_add_u32 s8, s8, s18 ; GFX9-NEXT: s_addc_u32 s17, 0, s17 -; GFX9-NEXT: s_add_u32 s4, s4, s16 -; GFX9-NEXT: s_mul_hi_u32 s5, s9, s14 -; GFX9-NEXT: s_addc_u32 s4, s17, s15 +; GFX9-NEXT: s_add_u32 s8, s8, s13 +; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4 +; GFX9-NEXT: s_addc_u32 s8, s17, s9 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_mul_i32 s14, s9, s14 -; GFX9-NEXT: s_add_u32 s4, s4, s14 -; GFX9-NEXT: s_addc_u32 s5, 0, s5 -; GFX9-NEXT: s_add_u32 s14, s8, s4 -; GFX9-NEXT: s_addc_u32 s15, s9, s5 +; GFX9-NEXT: s_mul_i32 s4, s12, s4 +; GFX9-NEXT: s_add_u32 s4, s8, s4 +; GFX9-NEXT: s_addc_u32 s8, 0, s5 +; GFX9-NEXT: s_add_u32 s13, s16, s4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_addc_u32 s12, s12, s8 ; GFX9-NEXT: s_ashr_i32 s4, s11, 31 ; GFX9-NEXT: s_add_u32 s8, s10, s4 ; GFX9-NEXT: s_mov_b32 s5, s4 ; GFX9-NEXT: s_addc_u32 s9, s11, s4 ; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5] -; GFX9-NEXT: s_mul_i32 s11, s8, s15 -; GFX9-NEXT: s_mul_hi_u32 s16, s8, s14 -; GFX9-NEXT: s_mul_hi_u32 s10, s8, s15 +; GFX9-NEXT: s_mul_i32 s11, s8, s12 +; GFX9-NEXT: s_mul_hi_u32 s16, s8, s13 +; GFX9-NEXT: s_mul_hi_u32 s10, s8, s12 ; GFX9-NEXT: s_add_u32 s11, s16, s11 ; GFX9-NEXT: s_addc_u32 s10, 0, s10 -; GFX9-NEXT: s_mul_hi_u32 s17, s9, s14 -; GFX9-NEXT: s_mul_i32 s14, s9, s14 -; GFX9-NEXT: s_add_u32 s11, s11, s14 -; GFX9-NEXT: s_mul_hi_u32 s16, s9, s15 +; GFX9-NEXT: s_mul_hi_u32 s17, s9, s13 +; GFX9-NEXT: s_mul_i32 s13, s9, s13 +; GFX9-NEXT: s_add_u32 s11, s11, s13 +; GFX9-NEXT: s_mul_hi_u32 s16, s9, s12 ; GFX9-NEXT: s_addc_u32 s10, s10, s17 ; GFX9-NEXT: s_addc_u32 s11, s16, 0 -; GFX9-NEXT: s_mul_i32 s14, s9, s15 -; GFX9-NEXT: s_add_u32 s14, s10, s14 -; GFX9-NEXT: s_addc_u32 s15, 0, s11 -; GFX9-NEXT: s_mul_i32 s10, s6, s15 -; GFX9-NEXT: s_mul_hi_u32 s11, s6, s14 +; GFX9-NEXT: s_mul_i32 s12, s9, s12 +; GFX9-NEXT: s_add_u32 s16, s10, s12 +; GFX9-NEXT: s_addc_u32 s17, 0, s11 +; GFX9-NEXT: s_mul_i32 s10, s6, s17 +; GFX9-NEXT: s_mul_hi_u32 s11, s6, s16 ; GFX9-NEXT: s_add_i32 s10, s11, s10 -; GFX9-NEXT: s_mul_i32 s11, s7, s14 -; GFX9-NEXT: s_add_i32 s16, s10, s11 -; GFX9-NEXT: s_sub_i32 s17, s9, s16 -; GFX9-NEXT: s_mul_i32 s10, s6, s14 +; GFX9-NEXT: s_mul_i32 s11, s7, s16 +; GFX9-NEXT: s_add_i32 s18, s10, s11 +; GFX9-NEXT: s_sub_i32 s12, s9, s18 +; GFX9-NEXT: s_mul_i32 s10, s6, s16 ; GFX9-NEXT: s_sub_u32 s8, s8, s10 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_subb_u32 s17, s17, s7 -; GFX9-NEXT: s_sub_u32 s18, s8, s6 -; GFX9-NEXT: s_subb_u32 s17, s17, 0 -; GFX9-NEXT: s_cmp_ge_u32 s17, s7 +; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GFX9-NEXT: s_subb_u32 s19, s12, s7 +; GFX9-NEXT: s_sub_u32 s20, s8, s6 +; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GFX9-NEXT: s_subb_u32 s12, s19, 0 +; GFX9-NEXT: s_cmp_ge_u32 s12, s7 +; GFX9-NEXT: s_cselect_b32 s13, -1, 0 +; GFX9-NEXT: s_cmp_ge_u32 s20, s6 ; GFX9-NEXT: s_cselect_b32 s19, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s18, s6 -; GFX9-NEXT: s_cselect_b32 s18, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s17, s7 -; GFX9-NEXT: s_cselect_b32 s17, s18, s19 -; GFX9-NEXT: s_add_u32 s18, s14, 1 -; GFX9-NEXT: s_addc_u32 s19, s15, 0 -; GFX9-NEXT: s_add_u32 s20, s14, 2 -; GFX9-NEXT: s_addc_u32 s21, s15, 0 -; GFX9-NEXT: s_cmp_lg_u32 s17, 0 -; GFX9-NEXT: s_cselect_b32 s17, s20, s18 -; GFX9-NEXT: s_cselect_b32 s18, s21, s19 +; GFX9-NEXT: s_cmp_eq_u32 s12, s7 +; GFX9-NEXT: s_cselect_b32 s12, s19, s13 +; GFX9-NEXT: s_add_u32 s13, s16, 1 +; GFX9-NEXT: s_addc_u32 s19, s17, 0 +; GFX9-NEXT: s_add_u32 s20, s16, 2 +; GFX9-NEXT: s_addc_u32 s21, s17, 0 +; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_cselect_b32 s12, s20, s13 +; GFX9-NEXT: s_cselect_b32 s13, s21, s19 ; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s9, s9, s16 +; GFX9-NEXT: s_subb_u32 s9, s9, s18 ; GFX9-NEXT: s_cmp_ge_u32 s9, s7 ; GFX9-NEXT: s_cselect_b32 s10, -1, 0 ; GFX9-NEXT: s_cmp_ge_u32 s8, s6 @@ -8828,14 +8861,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cmp_eq_u32 s9, s7 ; GFX9-NEXT: s_cselect_b32 s6, s6, s10 ; GFX9-NEXT: s_cmp_lg_u32 s6, 0 -; GFX9-NEXT: s_cselect_b32 s7, s18, s15 -; GFX9-NEXT: s_cselect_b32 s6, s17, s14 +; GFX9-NEXT: s_cselect_b32 s7, s13, s17 +; GFX9-NEXT: s_cselect_b32 s6, s12, s16 ; GFX9-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] ; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], s[2:3] ; GFX9-NEXT: s_sub_u32 s2, s4, s2 ; GFX9-NEXT: s_subb_u32 s3, s5, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: v_mov_b32_e32 v2, s15 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -9056,9 +9089,10 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_addc_u32 s13, 0, s14 ; GFX6-NEXT: s_add_u32 s14, s0, s1 ; GFX6-NEXT: v_mov_b32_e32 v0, s14 -; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 ; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_addc_u32 s12, s12, s13 ; GFX6-NEXT: s_mul_i32 s0, s10, s12 ; GFX6-NEXT: v_readfirstlane_b32 s1, v0 @@ -9089,6 +9123,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_add_u32 s13, s14, s0 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_addc_u32 s12, s12, s10 ; GFX6-NEXT: s_ashr_i32 s10, s7, 31 ; GFX6-NEXT: s_add_u32 s0, s6, s10 @@ -9123,43 +9158,46 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: v_readfirstlane_b32 s5, v0 ; GFX6-NEXT: s_add_i32 s4, s5, s4 ; GFX6-NEXT: s_mul_i32 s5, s9, s12 -; GFX6-NEXT: s_add_i32 s14, s4, s5 -; GFX6-NEXT: s_sub_i32 s13, s7, s14 +; GFX6-NEXT: s_add_i32 s13, s4, s5 +; GFX6-NEXT: s_sub_i32 s14, s7, s13 ; GFX6-NEXT: s_mul_i32 s4, s8, s12 ; GFX6-NEXT: s_sub_u32 s6, s6, s4 ; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX6-NEXT: s_or_b32 s12, s4, s5 -; GFX6-NEXT: s_subb_u32 s15, s13, s9 -; GFX6-NEXT: s_sub_u32 s16, s6, s8 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s17, s12, s13 -; GFX6-NEXT: s_subb_u32 s17, s15, 0 -; GFX6-NEXT: s_cmp_ge_u32 s17, s9 -; GFX6-NEXT: s_cselect_b32 s18, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s16, s8 -; GFX6-NEXT: s_cselect_b32 s19, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s17, s9 -; GFX6-NEXT: s_cselect_b32 s18, s19, s18 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s15, s15, s9 -; GFX6-NEXT: s_sub_u32 s19, s16, s8 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s12, s15, 0 -; GFX6-NEXT: s_cmp_lg_u32 s18, 0 -; GFX6-NEXT: s_cselect_b32 s13, s19, s16 -; GFX6-NEXT: s_cselect_b32 s12, s12, s17 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_subb_u32 s14, s14, s9 +; GFX6-NEXT: s_sub_u32 s15, s6, s8 +; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX6-NEXT: s_or_b32 s4, s4, s5 -; GFX6-NEXT: s_subb_u32 s4, s7, s14 -; GFX6-NEXT: s_cmp_ge_u32 s4, s9 +; GFX6-NEXT: s_cmp_lg_u32 s4, 0 +; GFX6-NEXT: s_subb_u32 s16, s14, 0 +; GFX6-NEXT: s_cmp_ge_u32 s16, s9 ; GFX6-NEXT: s_cselect_b32 s5, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s6, s8 +; GFX6-NEXT: s_cmp_ge_u32 s15, s8 +; GFX6-NEXT: s_cselect_b32 s17, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s16, s9 +; GFX6-NEXT: s_cselect_b32 s17, s17, s5 +; GFX6-NEXT: s_cmp_lg_u32 s4, 0 +; GFX6-NEXT: s_subb_u32 s14, s14, s9 +; GFX6-NEXT: s_sub_u32 s18, s15, s8 +; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX6-NEXT: s_or_b32 s4, s4, s5 +; GFX6-NEXT: s_cmp_lg_u32 s4, 0 +; GFX6-NEXT: s_subb_u32 s4, s14, 0 +; GFX6-NEXT: s_cmp_lg_u32 s17, 0 +; GFX6-NEXT: s_cselect_b32 s14, s18, s15 +; GFX6-NEXT: s_cselect_b32 s4, s4, s16 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_subb_u32 s5, s7, s13 +; GFX6-NEXT: s_cmp_ge_u32 s5, s9 ; GFX6-NEXT: s_cselect_b32 s7, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s4, s9 -; GFX6-NEXT: s_cselect_b32 s5, s7, s5 -; GFX6-NEXT: s_cmp_lg_u32 s5, 0 -; GFX6-NEXT: s_cselect_b32 s5, s12, s4 -; GFX6-NEXT: s_cselect_b32 s4, s13, s6 +; GFX6-NEXT: s_cmp_ge_u32 s6, s8 +; GFX6-NEXT: s_cselect_b32 s8, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s5, s9 +; GFX6-NEXT: s_cselect_b32 s7, s8, s7 +; GFX6-NEXT: s_cmp_lg_u32 s7, 0 +; GFX6-NEXT: s_cselect_b32 s5, s4, s5 +; GFX6-NEXT: s_cselect_b32 s4, s14, s6 ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11] ; GFX6-NEXT: s_sub_u32 s4, s4, s10 ; GFX6-NEXT: s_subb_u32 s5, s5, s10 @@ -9181,8 +9219,8 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_sub_u32 s4, 0, s6 -; GFX9-NEXT: s_subb_u32 s5, 0, s7 +; GFX9-NEXT: s_sub_u32 s8, 0, s6 +; GFX9-NEXT: s_subb_u32 s9, 0, s7 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -9192,52 +9230,56 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v1 -; GFX9-NEXT: s_mul_i32 s10, s4, s8 -; GFX9-NEXT: s_mul_hi_u32 s12, s4, s9 -; GFX9-NEXT: s_mul_i32 s11, s5, s9 -; GFX9-NEXT: s_add_i32 s10, s12, s10 -; GFX9-NEXT: s_mul_i32 s13, s4, s9 -; GFX9-NEXT: s_add_i32 s10, s10, s11 -; GFX9-NEXT: s_mul_hi_u32 s12, s9, s13 -; GFX9-NEXT: s_mul_i32 s14, s9, s10 -; GFX9-NEXT: s_mul_hi_u32 s11, s9, s10 +; GFX9-NEXT: v_readfirstlane_b32 s10, v2 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_mul_i32 s5, s8, s10 +; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4 +; GFX9-NEXT: s_mul_i32 s11, s9, s4 +; GFX9-NEXT: s_add_i32 s5, s12, s5 +; GFX9-NEXT: s_mul_i32 s13, s8, s4 +; GFX9-NEXT: s_add_i32 s5, s5, s11 +; GFX9-NEXT: s_mul_hi_u32 s12, s4, s13 +; GFX9-NEXT: s_mul_i32 s14, s4, s5 +; GFX9-NEXT: s_mul_hi_u32 s11, s4, s5 ; GFX9-NEXT: s_add_u32 s12, s12, s14 ; GFX9-NEXT: s_addc_u32 s11, 0, s11 -; GFX9-NEXT: s_mul_hi_u32 s15, s8, s13 -; GFX9-NEXT: s_mul_i32 s13, s8, s13 +; GFX9-NEXT: s_mul_hi_u32 s15, s10, s13 +; GFX9-NEXT: s_mul_i32 s13, s10, s13 ; GFX9-NEXT: s_add_u32 s12, s12, s13 -; GFX9-NEXT: s_mul_hi_u32 s14, s8, s10 +; GFX9-NEXT: s_mul_hi_u32 s14, s10, s5 ; GFX9-NEXT: s_addc_u32 s11, s11, s15 ; GFX9-NEXT: s_addc_u32 s12, s14, 0 -; GFX9-NEXT: s_mul_i32 s10, s8, s10 -; GFX9-NEXT: s_add_u32 s10, s11, s10 +; GFX9-NEXT: s_mul_i32 s5, s10, s5 +; GFX9-NEXT: s_add_u32 s5, s11, s5 ; GFX9-NEXT: s_addc_u32 s11, 0, s12 -; GFX9-NEXT: s_add_u32 s9, s9, s10 -; GFX9-NEXT: s_addc_u32 s8, s8, s11 -; GFX9-NEXT: s_mul_i32 s10, s4, s8 -; GFX9-NEXT: s_mul_hi_u32 s11, s4, s9 -; GFX9-NEXT: s_add_i32 s10, s11, s10 -; GFX9-NEXT: s_mul_i32 s5, s5, s9 -; GFX9-NEXT: s_add_i32 s10, s10, s5 -; GFX9-NEXT: s_mul_i32 s4, s4, s9 -; GFX9-NEXT: s_mul_hi_u32 s11, s8, s4 -; GFX9-NEXT: s_mul_i32 s12, s8, s4 -; GFX9-NEXT: s_mul_i32 s14, s9, s10 -; GFX9-NEXT: s_mul_hi_u32 s4, s9, s4 -; GFX9-NEXT: s_mul_hi_u32 s13, s9, s10 -; GFX9-NEXT: s_add_u32 s4, s4, s14 +; GFX9-NEXT: s_add_u32 s12, s4, s5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_addc_u32 s10, s10, s11 +; GFX9-NEXT: s_mul_i32 s4, s8, s10 +; GFX9-NEXT: s_mul_hi_u32 s5, s8, s12 +; GFX9-NEXT: s_add_i32 s4, s5, s4 +; GFX9-NEXT: s_mul_i32 s9, s9, s12 +; GFX9-NEXT: s_add_i32 s4, s4, s9 +; GFX9-NEXT: s_mul_i32 s8, s8, s12 +; GFX9-NEXT: s_mul_hi_u32 s9, s10, s8 +; GFX9-NEXT: s_mul_i32 s11, s10, s8 +; GFX9-NEXT: s_mul_i32 s14, s12, s4 +; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8 +; GFX9-NEXT: s_mul_hi_u32 s13, s12, s4 +; GFX9-NEXT: s_add_u32 s8, s8, s14 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_add_u32 s4, s4, s12 -; GFX9-NEXT: s_mul_hi_u32 s5, s8, s10 -; GFX9-NEXT: s_addc_u32 s4, s13, s11 +; GFX9-NEXT: s_add_u32 s8, s8, s11 +; GFX9-NEXT: s_mul_hi_u32 s5, s10, s4 +; GFX9-NEXT: s_addc_u32 s8, s13, s9 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_mul_i32 s10, s8, s10 -; GFX9-NEXT: s_add_u32 s4, s4, s10 -; GFX9-NEXT: s_addc_u32 s5, 0, s5 -; GFX9-NEXT: s_add_u32 s9, s9, s4 -; GFX9-NEXT: s_addc_u32 s8, s8, s5 +; GFX9-NEXT: s_mul_i32 s4, s10, s4 +; GFX9-NEXT: s_add_u32 s4, s8, s4 +; GFX9-NEXT: s_addc_u32 s8, 0, s5 +; GFX9-NEXT: s_add_u32 s9, s12, s4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_addc_u32 s8, s10, s8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: s_add_u32 s2, s2, s4 @@ -9267,9 +9309,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_mul_i32 s8, s6, s8 ; GFX9-NEXT: s_sub_u32 s2, s2, s8 ; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_subb_u32 s13, s10, s7 ; GFX9-NEXT: s_sub_u32 s14, s2, s6 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 ; GFX9-NEXT: s_subb_u32 s15, s13, 0 ; GFX9-NEXT: s_cmp_ge_u32 s15, s7 ; GFX9-NEXT: s_cselect_b32 s16, -1, 0 @@ -9278,11 +9322,13 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_cmp_eq_u32 s15, s7 ; GFX9-NEXT: s_cselect_b32 s16, s17, s16 ; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s10, s13, s7 -; GFX9-NEXT: s_sub_u32 s11, s14, s6 -; GFX9-NEXT: s_subb_u32 s10, s10, 0 +; GFX9-NEXT: s_subb_u32 s13, s13, s7 +; GFX9-NEXT: s_sub_u32 s17, s14, s6 +; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GFX9-NEXT: s_subb_u32 s10, s13, 0 ; GFX9-NEXT: s_cmp_lg_u32 s16, 0 -; GFX9-NEXT: s_cselect_b32 s11, s11, s14 +; GFX9-NEXT: s_cselect_b32 s11, s17, s14 ; GFX9-NEXT: s_cselect_b32 s10, s10, s15 ; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_subb_u32 s3, s3, s12 @@ -9444,9 +9490,10 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_addc_u32 s15, 0, s16 ; GFX6-NEXT: s_add_u32 s16, s6, s7 ; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 ; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 ; GFX6-NEXT: s_or_b32 s6, s6, s7 +; GFX6-NEXT: s_cmp_lg_u32 s6, 0 ; GFX6-NEXT: s_addc_u32 s14, s14, s15 ; GFX6-NEXT: s_mul_i32 s6, s12, s14 ; GFX6-NEXT: v_readfirstlane_b32 s7, v0 @@ -9477,6 +9524,7 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_add_u32 s13, s16, s6 ; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX6-NEXT: s_or_b32 s6, s6, s7 +; GFX6-NEXT: s_cmp_lg_u32 s6, 0 ; GFX6-NEXT: s_addc_u32 s12, s14, s12 ; GFX6-NEXT: s_ashr_i32 s6, s9, 31 ; GFX6-NEXT: s_add_u32 s8, s8, s6 @@ -9509,46 +9557,49 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_readfirstlane_b32 s14, v0 ; GFX6-NEXT: s_add_i32 s13, s14, s13 ; GFX6-NEXT: s_mul_i32 s14, s3, s12 -; GFX6-NEXT: s_add_i32 s16, s13, s14 -; GFX6-NEXT: s_sub_i32 s14, s9, s16 +; GFX6-NEXT: s_add_i32 s14, s13, s14 +; GFX6-NEXT: s_sub_i32 s15, s9, s14 ; GFX6-NEXT: s_mul_i32 s12, s2, s12 ; GFX6-NEXT: s_sub_u32 s8, s8, s12 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s15, s12, s13 -; GFX6-NEXT: s_subb_u32 s17, s14, s3 -; GFX6-NEXT: s_sub_u32 s18, s8, s2 -; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s19, s14, s15 -; GFX6-NEXT: s_subb_u32 s19, s17, 0 -; GFX6-NEXT: s_cmp_ge_u32 s19, s3 -; GFX6-NEXT: s_cselect_b32 s20, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s18, s2 -; GFX6-NEXT: s_cselect_b32 s21, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s19, s3 -; GFX6-NEXT: s_cselect_b32 s20, s21, s20 -; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_subb_u32 s17, s17, s3 -; GFX6-NEXT: s_sub_u32 s21, s18, s2 -; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_subb_u32 s14, s17, 0 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_cselect_b32 s15, s21, s18 -; GFX6-NEXT: s_cselect_b32 s14, s14, s19 +; GFX6-NEXT: s_or_b32 s16, s12, s13 +; GFX6-NEXT: s_cmp_lg_u32 s16, 0 +; GFX6-NEXT: s_subb_u32 s15, s15, s3 +; GFX6-NEXT: s_sub_u32 s17, s8, s2 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_subb_u32 s18, s15, 0 +; GFX6-NEXT: s_cmp_ge_u32 s18, s3 +; GFX6-NEXT: s_cselect_b32 s13, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s17, s2 +; GFX6-NEXT: s_cselect_b32 s19, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s18, s3 +; GFX6-NEXT: s_cselect_b32 s19, s19, s13 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_subb_u32 s15, s15, s3 +; GFX6-NEXT: s_sub_u32 s20, s17, s2 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s9, s9, s16 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_subb_u32 s12, s15, 0 +; GFX6-NEXT: s_cmp_lg_u32 s19, 0 +; GFX6-NEXT: s_cselect_b32 s13, s20, s17 +; GFX6-NEXT: s_cselect_b32 s12, s12, s18 +; GFX6-NEXT: s_cmp_lg_u32 s16, 0 +; GFX6-NEXT: s_subb_u32 s9, s9, s14 ; GFX6-NEXT: s_cmp_ge_u32 s9, s3 -; GFX6-NEXT: s_cselect_b32 s12, -1, 0 +; GFX6-NEXT: s_cselect_b32 s14, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s8, s2 ; GFX6-NEXT: s_cselect_b32 s2, -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s9, s3 -; GFX6-NEXT: s_cselect_b32 s2, s2, s12 +; GFX6-NEXT: s_cselect_b32 s2, s2, s14 ; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_cselect_b32 s3, s14, s9 -; GFX6-NEXT: s_cselect_b32 s2, s15, s8 +; GFX6-NEXT: s_cselect_b32 s3, s12, s9 +; GFX6-NEXT: s_cselect_b32 s2, s13, s8 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] -; GFX6-NEXT: s_sub_u32 s14, s2, s6 -; GFX6-NEXT: s_subb_u32 s15, s3, s6 +; GFX6-NEXT: s_sub_u32 s12, s2, s6 +; GFX6-NEXT: s_subb_u32 s13, s3, s6 ; GFX6-NEXT: s_ashr_i32 s2, s1, 31 ; GFX6-NEXT: s_add_u32 s0, s0, s2 ; GFX6-NEXT: s_mov_b32 s3, s2 @@ -9567,39 +9618,40 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s12, v1 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s1, s8, s12 +; GFX6-NEXT: s_mul_i32 s1, s8, s14 ; GFX6-NEXT: v_readfirstlane_b32 s3, v2 ; GFX6-NEXT: s_mul_i32 s0, s9, s2 ; GFX6-NEXT: s_add_i32 s1, s3, s1 ; GFX6-NEXT: s_add_i32 s3, s1, s0 -; GFX6-NEXT: s_mul_i32 s13, s8, s2 +; GFX6-NEXT: s_mul_i32 s15, s8, s2 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mul_i32 s4, s2, s3 ; GFX6-NEXT: v_readfirstlane_b32 s5, v2 ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, v1, s13 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15 ; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3 ; GFX6-NEXT: s_add_u32 s4, s16, s4 ; GFX6-NEXT: s_addc_u32 s5, 0, s5 -; GFX6-NEXT: s_mul_i32 s13, s12, s13 +; GFX6-NEXT: s_mul_i32 s15, s14, s15 ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 -; GFX6-NEXT: s_add_u32 s4, s4, s13 +; GFX6-NEXT: s_add_u32 s4, s4, s15 ; GFX6-NEXT: s_addc_u32 s4, s5, s16 ; GFX6-NEXT: v_readfirstlane_b32 s5, v1 ; GFX6-NEXT: s_addc_u32 s5, s5, 0 -; GFX6-NEXT: s_mul_i32 s3, s12, s3 +; GFX6-NEXT: s_mul_i32 s3, s14, s3 ; GFX6-NEXT: s_add_u32 s3, s4, s3 ; GFX6-NEXT: s_addc_u32 s4, 0, s5 ; GFX6-NEXT: s_add_u32 s5, s2, s3 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_addc_u32 s4, s12, s4 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: s_addc_u32 s4, s14, s4 ; GFX6-NEXT: s_mul_i32 s2, s8, s4 ; GFX6-NEXT: v_readfirstlane_b32 s3, v0 ; GFX6-NEXT: s_add_i32 s2, s3, s2 @@ -9613,98 +9665,102 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: s_mul_i32 s9, s5, s2 -; GFX6-NEXT: v_readfirstlane_b32 s13, v2 -; GFX6-NEXT: s_add_u32 s9, s13, s9 -; GFX6-NEXT: v_readfirstlane_b32 s12, v0 +; GFX6-NEXT: v_readfirstlane_b32 s15, v2 +; GFX6-NEXT: s_add_u32 s9, s15, s9 +; GFX6-NEXT: v_readfirstlane_b32 s14, v0 ; GFX6-NEXT: s_mul_i32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s12, 0, s12 +; GFX6-NEXT: s_addc_u32 s14, 0, s14 ; GFX6-NEXT: v_readfirstlane_b32 s8, v3 ; GFX6-NEXT: s_add_u32 s3, s9, s3 -; GFX6-NEXT: s_addc_u32 s3, s12, s8 +; GFX6-NEXT: s_addc_u32 s3, s14, s8 ; GFX6-NEXT: v_readfirstlane_b32 s8, v1 ; GFX6-NEXT: s_addc_u32 s8, s8, 0 ; GFX6-NEXT: s_mul_i32 s2, s4, s2 ; GFX6-NEXT: s_add_u32 s2, s3, s2 ; GFX6-NEXT: s_addc_u32 s8, 0, s8 -; GFX6-NEXT: s_add_u32 s12, s5, s2 +; GFX6-NEXT: s_add_u32 s14, s5, s2 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_addc_u32 s13, s4, s8 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: s_addc_u32 s15, s4, s8 ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 ; GFX6-NEXT: s_add_u32 s2, s10, s4 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_addc_u32 s3, s11, s4 ; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, s13 +; GFX6-NEXT: v_mov_b32_e32 v0, s15 ; GFX6-NEXT: v_mul_hi_u32 v1, s8, v0 -; GFX6-NEXT: v_mov_b32_e32 v2, s12 +; GFX6-NEXT: v_mov_b32_e32 v2, s14 ; GFX6-NEXT: v_mul_hi_u32 v3, s8, v2 -; GFX6-NEXT: s_mul_i32 s2, s8, s13 +; GFX6-NEXT: s_mul_i32 s2, s8, s15 ; GFX6-NEXT: v_readfirstlane_b32 s10, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s9, v2 ; GFX6-NEXT: v_readfirstlane_b32 s11, v3 ; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 ; GFX6-NEXT: s_add_u32 s2, s11, s2 ; GFX6-NEXT: s_addc_u32 s10, 0, s10 -; GFX6-NEXT: s_mul_i32 s11, s9, s12 -; GFX6-NEXT: v_readfirstlane_b32 s12, v1 +; GFX6-NEXT: s_mul_i32 s11, s9, s14 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 ; GFX6-NEXT: s_add_u32 s2, s2, s11 -; GFX6-NEXT: s_addc_u32 s2, s10, s12 +; GFX6-NEXT: s_addc_u32 s2, s10, s14 ; GFX6-NEXT: v_readfirstlane_b32 s10, v0 ; GFX6-NEXT: s_addc_u32 s10, s10, 0 -; GFX6-NEXT: s_mul_i32 s11, s9, s13 +; GFX6-NEXT: s_mul_i32 s11, s9, s15 ; GFX6-NEXT: s_add_u32 s11, s2, s11 ; GFX6-NEXT: v_mov_b32_e32 v0, s11 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX6-NEXT: s_addc_u32 s10, 0, s10 ; GFX6-NEXT: s_mul_i32 s10, s6, s10 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: v_readfirstlane_b32 s12, v0 -; GFX6-NEXT: s_add_i32 s10, s12, s10 -; GFX6-NEXT: s_mul_i32 s12, s7, s11 -; GFX6-NEXT: s_add_i32 s16, s10, s12 -; GFX6-NEXT: s_sub_i32 s12, s9, s16 +; GFX6-NEXT: v_readfirstlane_b32 s14, v0 +; GFX6-NEXT: s_add_i32 s10, s14, s10 +; GFX6-NEXT: s_mul_i32 s14, s7, s11 +; GFX6-NEXT: s_add_i32 s14, s10, s14 +; GFX6-NEXT: s_sub_i32 s15, s9, s14 ; GFX6-NEXT: s_mul_i32 s10, s6, s11 ; GFX6-NEXT: s_sub_u32 s8, s8, s10 ; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX6-NEXT: s_or_b32 s13, s10, s11 -; GFX6-NEXT: s_subb_u32 s17, s12, s7 -; GFX6-NEXT: s_sub_u32 s18, s8, s6 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s19, s12, s13 -; GFX6-NEXT: s_subb_u32 s19, s17, 0 -; GFX6-NEXT: s_cmp_ge_u32 s19, s7 -; GFX6-NEXT: s_cselect_b32 s20, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s18, s6 -; GFX6-NEXT: s_cselect_b32 s21, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s19, s7 -; GFX6-NEXT: s_cselect_b32 s20, s21, s20 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s17, s17, s7 -; GFX6-NEXT: s_sub_u32 s21, s18, s6 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s12, s17, 0 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_cselect_b32 s13, s21, s18 -; GFX6-NEXT: s_cselect_b32 s12, s12, s19 +; GFX6-NEXT: s_or_b32 s16, s10, s11 +; GFX6-NEXT: s_cmp_lg_u32 s16, 0 +; GFX6-NEXT: s_subb_u32 s15, s15, s7 +; GFX6-NEXT: s_sub_u32 s17, s8, s6 +; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GFX6-NEXT: s_or_b32 s10, s10, s11 -; GFX6-NEXT: s_subb_u32 s9, s9, s16 +; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_subb_u32 s18, s15, 0 +; GFX6-NEXT: s_cmp_ge_u32 s18, s7 +; GFX6-NEXT: s_cselect_b32 s11, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s17, s6 +; GFX6-NEXT: s_cselect_b32 s19, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s18, s7 +; GFX6-NEXT: s_cselect_b32 s19, s19, s11 +; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_subb_u32 s15, s15, s7 +; GFX6-NEXT: s_sub_u32 s20, s17, s6 +; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX6-NEXT: s_or_b32 s10, s10, s11 +; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_subb_u32 s10, s15, 0 +; GFX6-NEXT: s_cmp_lg_u32 s19, 0 +; GFX6-NEXT: s_cselect_b32 s11, s20, s17 +; GFX6-NEXT: s_cselect_b32 s10, s10, s18 +; GFX6-NEXT: s_cmp_lg_u32 s16, 0 +; GFX6-NEXT: s_subb_u32 s9, s9, s14 ; GFX6-NEXT: s_cmp_ge_u32 s9, s7 -; GFX6-NEXT: s_cselect_b32 s10, -1, 0 +; GFX6-NEXT: s_cselect_b32 s14, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s8, s6 ; GFX6-NEXT: s_cselect_b32 s6, -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s9, s7 -; GFX6-NEXT: s_cselect_b32 s6, s6, s10 +; GFX6-NEXT: s_cselect_b32 s6, s6, s14 ; GFX6-NEXT: s_cmp_lg_u32 s6, 0 -; GFX6-NEXT: s_cselect_b32 s7, s12, s9 -; GFX6-NEXT: s_cselect_b32 s6, s13, s8 +; GFX6-NEXT: s_cselect_b32 s7, s10, s9 +; GFX6-NEXT: s_cselect_b32 s6, s11, s8 ; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GFX6-NEXT: s_sub_u32 s5, s6, s4 ; GFX6-NEXT: s_subb_u32 s4, s7, s4 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v0, s14 -; GFX6-NEXT: v_mov_b32_e32 v1, s15 +; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mov_b32_e32 v1, s13 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: v_mov_b32_e32 v3, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9724,8 +9780,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX9-NEXT: s_sub_u32 s6, 0, s2 -; GFX9-NEXT: s_subb_u32 s7, 0, s3 +; GFX9-NEXT: s_sub_u32 s12, 0, s2 +; GFX9-NEXT: s_subb_u32 s13, 0, s3 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -9734,52 +9790,56 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s12, v1 -; GFX9-NEXT: v_readfirstlane_b32 s13, v0 -; GFX9-NEXT: s_mul_i32 s14, s6, s12 -; GFX9-NEXT: s_mul_hi_u32 s16, s6, s13 -; GFX9-NEXT: s_mul_i32 s15, s7, s13 -; GFX9-NEXT: s_add_i32 s14, s16, s14 -; GFX9-NEXT: s_mul_i32 s17, s6, s13 -; GFX9-NEXT: s_add_i32 s14, s14, s15 -; GFX9-NEXT: s_mul_hi_u32 s16, s13, s17 -; GFX9-NEXT: s_mul_i32 s18, s13, s14 -; GFX9-NEXT: s_mul_hi_u32 s15, s13, s14 +; GFX9-NEXT: v_readfirstlane_b32 s14, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: s_mul_i32 s7, s12, s14 +; GFX9-NEXT: s_mul_hi_u32 s16, s12, s6 +; GFX9-NEXT: s_mul_i32 s15, s13, s6 +; GFX9-NEXT: s_add_i32 s7, s16, s7 +; GFX9-NEXT: s_mul_i32 s17, s12, s6 +; GFX9-NEXT: s_add_i32 s7, s7, s15 +; GFX9-NEXT: s_mul_hi_u32 s16, s6, s17 +; GFX9-NEXT: s_mul_i32 s18, s6, s7 +; GFX9-NEXT: s_mul_hi_u32 s15, s6, s7 ; GFX9-NEXT: s_add_u32 s16, s16, s18 ; GFX9-NEXT: s_addc_u32 s15, 0, s15 -; GFX9-NEXT: s_mul_hi_u32 s18, s12, s17 -; GFX9-NEXT: s_mul_i32 s17, s12, s17 +; GFX9-NEXT: s_mul_hi_u32 s18, s14, s17 +; GFX9-NEXT: s_mul_i32 s17, s14, s17 ; GFX9-NEXT: s_add_u32 s16, s16, s17 -; GFX9-NEXT: s_mul_hi_u32 s19, s12, s14 +; GFX9-NEXT: s_mul_hi_u32 s19, s14, s7 ; GFX9-NEXT: s_addc_u32 s15, s15, s18 ; GFX9-NEXT: s_addc_u32 s16, s19, 0 -; GFX9-NEXT: s_mul_i32 s14, s12, s14 -; GFX9-NEXT: s_add_u32 s14, s15, s14 +; GFX9-NEXT: s_mul_i32 s7, s14, s7 +; GFX9-NEXT: s_add_u32 s7, s15, s7 ; GFX9-NEXT: s_addc_u32 s15, 0, s16 -; GFX9-NEXT: s_add_u32 s13, s13, s14 -; GFX9-NEXT: s_addc_u32 s12, s12, s15 -; GFX9-NEXT: s_mul_i32 s14, s6, s12 -; GFX9-NEXT: s_mul_hi_u32 s15, s6, s13 -; GFX9-NEXT: s_add_i32 s14, s15, s14 -; GFX9-NEXT: s_mul_i32 s7, s7, s13 -; GFX9-NEXT: s_add_i32 s14, s14, s7 -; GFX9-NEXT: s_mul_i32 s6, s6, s13 -; GFX9-NEXT: s_mul_hi_u32 s15, s12, s6 -; GFX9-NEXT: s_mul_i32 s16, s12, s6 -; GFX9-NEXT: s_mul_i32 s18, s13, s14 -; GFX9-NEXT: s_mul_hi_u32 s6, s13, s6 -; GFX9-NEXT: s_mul_hi_u32 s17, s13, s14 -; GFX9-NEXT: s_add_u32 s6, s6, s18 +; GFX9-NEXT: s_add_u32 s16, s6, s7 +; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9-NEXT: s_addc_u32 s14, s14, s15 +; GFX9-NEXT: s_mul_i32 s6, s12, s14 +; GFX9-NEXT: s_mul_hi_u32 s7, s12, s16 +; GFX9-NEXT: s_add_i32 s6, s7, s6 +; GFX9-NEXT: s_mul_i32 s13, s13, s16 +; GFX9-NEXT: s_add_i32 s6, s6, s13 +; GFX9-NEXT: s_mul_i32 s12, s12, s16 +; GFX9-NEXT: s_mul_hi_u32 s13, s14, s12 +; GFX9-NEXT: s_mul_i32 s15, s14, s12 +; GFX9-NEXT: s_mul_i32 s18, s16, s6 +; GFX9-NEXT: s_mul_hi_u32 s12, s16, s12 +; GFX9-NEXT: s_mul_hi_u32 s17, s16, s6 +; GFX9-NEXT: s_add_u32 s12, s12, s18 ; GFX9-NEXT: s_addc_u32 s17, 0, s17 -; GFX9-NEXT: s_add_u32 s6, s6, s16 -; GFX9-NEXT: s_mul_hi_u32 s7, s12, s14 -; GFX9-NEXT: s_addc_u32 s6, s17, s15 +; GFX9-NEXT: s_add_u32 s12, s12, s15 +; GFX9-NEXT: s_mul_hi_u32 s7, s14, s6 +; GFX9-NEXT: s_addc_u32 s12, s17, s13 ; GFX9-NEXT: s_addc_u32 s7, s7, 0 -; GFX9-NEXT: s_mul_i32 s14, s12, s14 -; GFX9-NEXT: s_add_u32 s6, s6, s14 -; GFX9-NEXT: s_addc_u32 s7, 0, s7 -; GFX9-NEXT: s_add_u32 s13, s13, s6 -; GFX9-NEXT: s_addc_u32 s12, s12, s7 +; GFX9-NEXT: s_mul_i32 s6, s14, s6 +; GFX9-NEXT: s_add_u32 s6, s12, s6 +; GFX9-NEXT: s_addc_u32 s12, 0, s7 +; GFX9-NEXT: s_add_u32 s13, s16, s6 +; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9-NEXT: s_addc_u32 s12, s14, s12 ; GFX9-NEXT: s_ashr_i32 s6, s9, 31 ; GFX9-NEXT: s_add_u32 s8, s8, s6 ; GFX9-NEXT: s_mov_b32 s7, s6 @@ -9808,9 +9868,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_mul_i32 s12, s2, s12 ; GFX9-NEXT: s_sub_u32 s8, s8, s12 ; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 ; GFX9-NEXT: s_subb_u32 s17, s14, s3 ; GFX9-NEXT: s_sub_u32 s18, s8, s2 ; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 ; GFX9-NEXT: s_subb_u32 s19, s17, 0 ; GFX9-NEXT: s_cmp_ge_u32 s19, s3 ; GFX9-NEXT: s_cselect_b32 s20, -1, 0 @@ -9819,11 +9881,13 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cmp_eq_u32 s19, s3 ; GFX9-NEXT: s_cselect_b32 s20, s21, s20 ; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GFX9-NEXT: s_subb_u32 s14, s17, s3 -; GFX9-NEXT: s_sub_u32 s15, s18, s2 -; GFX9-NEXT: s_subb_u32 s14, s14, 0 +; GFX9-NEXT: s_subb_u32 s17, s17, s3 +; GFX9-NEXT: s_sub_u32 s21, s18, s2 +; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 +; GFX9-NEXT: s_subb_u32 s14, s17, 0 ; GFX9-NEXT: s_cmp_lg_u32 s20, 0 -; GFX9-NEXT: s_cselect_b32 s15, s15, s18 +; GFX9-NEXT: s_cselect_b32 s15, s21, s18 ; GFX9-NEXT: s_cselect_b32 s14, s14, s19 ; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 ; GFX9-NEXT: s_subb_u32 s9, s9, s16 @@ -9847,8 +9911,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_sub_u32 s4, 0, s2 -; GFX9-NEXT: s_subb_u32 s5, 0, s3 +; GFX9-NEXT: s_sub_u32 s6, 0, s2 +; GFX9-NEXT: s_subb_u32 s7, 0, s3 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -9858,70 +9922,74 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_readfirstlane_b32 s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: v_readfirstlane_b32 s9, v2 -; GFX9-NEXT: s_mul_hi_u32 s8, s4, s6 -; GFX9-NEXT: s_mul_i32 s14, s4, s9 -; GFX9-NEXT: s_mul_i32 s7, s5, s6 +; GFX9-NEXT: s_mul_hi_u32 s8, s6, s4 +; GFX9-NEXT: s_mul_i32 s14, s6, s9 +; GFX9-NEXT: s_mul_i32 s5, s7, s4 ; GFX9-NEXT: s_add_i32 s8, s8, s14 -; GFX9-NEXT: s_add_i32 s8, s8, s7 -; GFX9-NEXT: s_mul_i32 s15, s4, s6 -; GFX9-NEXT: s_mul_i32 s14, s6, s8 -; GFX9-NEXT: s_mul_hi_u32 s16, s6, s15 -; GFX9-NEXT: s_mul_hi_u32 s7, s6, s8 +; GFX9-NEXT: s_add_i32 s8, s8, s5 +; GFX9-NEXT: s_mul_i32 s15, s6, s4 +; GFX9-NEXT: s_mul_i32 s14, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s16, s4, s15 +; GFX9-NEXT: s_mul_hi_u32 s5, s4, s8 ; GFX9-NEXT: s_add_u32 s14, s16, s14 -; GFX9-NEXT: s_addc_u32 s7, 0, s7 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 ; GFX9-NEXT: s_mul_hi_u32 s17, s9, s15 ; GFX9-NEXT: s_mul_i32 s15, s9, s15 ; GFX9-NEXT: s_add_u32 s14, s14, s15 ; GFX9-NEXT: s_mul_hi_u32 s16, s9, s8 -; GFX9-NEXT: s_addc_u32 s7, s7, s17 +; GFX9-NEXT: s_addc_u32 s5, s5, s17 ; GFX9-NEXT: s_addc_u32 s14, s16, 0 ; GFX9-NEXT: s_mul_i32 s8, s9, s8 -; GFX9-NEXT: s_add_u32 s7, s7, s8 +; GFX9-NEXT: s_add_u32 s5, s5, s8 ; GFX9-NEXT: s_addc_u32 s8, 0, s14 -; GFX9-NEXT: s_add_u32 s6, s6, s7 -; GFX9-NEXT: s_addc_u32 s7, s9, s8 -; GFX9-NEXT: s_mul_i32 s8, s4, s7 -; GFX9-NEXT: s_mul_hi_u32 s9, s4, s6 -; GFX9-NEXT: s_add_i32 s8, s9, s8 -; GFX9-NEXT: s_mul_i32 s5, s5, s6 -; GFX9-NEXT: s_add_i32 s8, s8, s5 -; GFX9-NEXT: s_mul_i32 s4, s4, s6 -; GFX9-NEXT: s_mul_hi_u32 s9, s7, s4 -; GFX9-NEXT: s_mul_i32 s14, s7, s4 -; GFX9-NEXT: s_mul_i32 s16, s6, s8 -; GFX9-NEXT: s_mul_hi_u32 s4, s6, s4 -; GFX9-NEXT: s_mul_hi_u32 s15, s6, s8 -; GFX9-NEXT: s_add_u32 s4, s4, s16 +; GFX9-NEXT: s_add_u32 s14, s4, s5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_addc_u32 s8, s9, s8 +; GFX9-NEXT: s_mul_i32 s4, s6, s8 +; GFX9-NEXT: s_mul_hi_u32 s5, s6, s14 +; GFX9-NEXT: s_add_i32 s4, s5, s4 +; GFX9-NEXT: s_mul_i32 s7, s7, s14 +; GFX9-NEXT: s_add_i32 s4, s4, s7 +; GFX9-NEXT: s_mul_i32 s6, s6, s14 +; GFX9-NEXT: s_mul_hi_u32 s7, s8, s6 +; GFX9-NEXT: s_mul_i32 s9, s8, s6 +; GFX9-NEXT: s_mul_i32 s16, s14, s4 +; GFX9-NEXT: s_mul_hi_u32 s6, s14, s6 +; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4 +; GFX9-NEXT: s_add_u32 s6, s6, s16 ; GFX9-NEXT: s_addc_u32 s15, 0, s15 -; GFX9-NEXT: s_add_u32 s4, s4, s14 -; GFX9-NEXT: s_mul_hi_u32 s5, s7, s8 -; GFX9-NEXT: s_addc_u32 s4, s15, s9 +; GFX9-NEXT: s_add_u32 s6, s6, s9 +; GFX9-NEXT: s_mul_hi_u32 s5, s8, s4 +; GFX9-NEXT: s_addc_u32 s6, s15, s7 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_mul_i32 s8, s7, s8 -; GFX9-NEXT: s_add_u32 s4, s4, s8 -; GFX9-NEXT: s_addc_u32 s5, 0, s5 -; GFX9-NEXT: s_add_u32 s8, s6, s4 -; GFX9-NEXT: s_addc_u32 s9, s7, s5 +; GFX9-NEXT: s_mul_i32 s4, s8, s4 +; GFX9-NEXT: s_add_u32 s4, s6, s4 +; GFX9-NEXT: s_addc_u32 s6, 0, s5 +; GFX9-NEXT: s_add_u32 s9, s14, s4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_addc_u32 s8, s8, s6 ; GFX9-NEXT: s_ashr_i32 s4, s11, 31 ; GFX9-NEXT: s_add_u32 s6, s10, s4 ; GFX9-NEXT: s_mov_b32 s5, s4 ; GFX9-NEXT: s_addc_u32 s7, s11, s4 ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] -; GFX9-NEXT: s_mul_i32 s11, s6, s9 -; GFX9-NEXT: s_mul_hi_u32 s14, s6, s8 -; GFX9-NEXT: s_mul_hi_u32 s10, s6, s9 +; GFX9-NEXT: s_mul_i32 s11, s6, s8 +; GFX9-NEXT: s_mul_hi_u32 s14, s6, s9 +; GFX9-NEXT: s_mul_hi_u32 s10, s6, s8 ; GFX9-NEXT: s_add_u32 s11, s14, s11 ; GFX9-NEXT: s_addc_u32 s10, 0, s10 -; GFX9-NEXT: s_mul_hi_u32 s15, s7, s8 -; GFX9-NEXT: s_mul_i32 s8, s7, s8 -; GFX9-NEXT: s_add_u32 s8, s11, s8 -; GFX9-NEXT: s_mul_hi_u32 s14, s7, s9 -; GFX9-NEXT: s_addc_u32 s8, s10, s15 -; GFX9-NEXT: s_addc_u32 s10, s14, 0 +; GFX9-NEXT: s_mul_hi_u32 s15, s7, s9 ; GFX9-NEXT: s_mul_i32 s9, s7, s9 -; GFX9-NEXT: s_add_u32 s8, s8, s9 +; GFX9-NEXT: s_add_u32 s9, s11, s9 +; GFX9-NEXT: s_mul_hi_u32 s14, s7, s8 +; GFX9-NEXT: s_addc_u32 s9, s10, s15 +; GFX9-NEXT: s_addc_u32 s10, s14, 0 +; GFX9-NEXT: s_mul_i32 s8, s7, s8 +; GFX9-NEXT: s_add_u32 s8, s9, s8 ; GFX9-NEXT: s_addc_u32 s9, 0, s10 ; GFX9-NEXT: s_mul_i32 s9, s2, s9 ; GFX9-NEXT: s_mul_hi_u32 s10, s2, s8 @@ -9932,9 +10000,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_mul_i32 s8, s2, s8 ; GFX9-NEXT: s_sub_u32 s6, s6, s8 ; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_subb_u32 s15, s10, s3 ; GFX9-NEXT: s_sub_u32 s16, s6, s2 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 ; GFX9-NEXT: s_subb_u32 s17, s15, 0 ; GFX9-NEXT: s_cmp_ge_u32 s17, s3 ; GFX9-NEXT: s_cselect_b32 s18, -1, 0 @@ -9943,11 +10013,13 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cmp_eq_u32 s17, s3 ; GFX9-NEXT: s_cselect_b32 s18, s19, s18 ; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s10, s15, s3 -; GFX9-NEXT: s_sub_u32 s11, s16, s2 -; GFX9-NEXT: s_subb_u32 s10, s10, 0 +; GFX9-NEXT: s_subb_u32 s15, s15, s3 +; GFX9-NEXT: s_sub_u32 s19, s16, s2 +; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GFX9-NEXT: s_subb_u32 s10, s15, 0 ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cselect_b32 s11, s11, s16 +; GFX9-NEXT: s_cselect_b32 s11, s19, s16 ; GFX9-NEXT: s_cselect_b32 s10, s10, s17 ; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_subb_u32 s7, s7, s14 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index 01f4414..394727c 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -612,11 +612,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8-NEXT: s_add_i32 s2, s2, s8 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -652,11 +653,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9-NEXT: s_add_i32 s2, s2, s8 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -691,10 +693,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX10W64-NEXT: s_add_i32 s2, s2, s8 +; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_add_i32 s2, s2, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -730,10 +733,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 +; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -770,10 +774,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX11W64-NEXT: s_add_i32 s2, s2, s8 +; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_add_i32 s2, s2, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -813,10 +818,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -853,10 +859,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 +; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 +; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -894,15 +901,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -992,11 +999,12 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8-NEXT: s_add_i32 s2, s2, s8 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1034,11 +1042,12 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9-NEXT: s_add_i32 s2, s2, s8 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1075,10 +1084,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX10W64-NEXT: s_add_i32 s2, s2, s8 +; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_add_i32 s2, s2, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1117,10 +1127,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 +; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1160,10 +1171,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX11W64-NEXT: s_add_i32 s2, s2, s8 +; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_add_i32 s2, s2, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1206,10 +1218,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1248,10 +1261,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 +; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 +; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1292,15 +1306,15 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB3_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2059,11 +2073,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8-NEXT: s_add_i32 s2, s2, s8 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2099,11 +2114,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9-NEXT: s_add_i32 s2, s2, s8 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2138,10 +2154,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX10W64-NEXT: s_add_i32 s2, s2, s8 +; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_add_i32 s2, s2, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2177,10 +2194,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 +; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2217,10 +2235,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX11W64-NEXT: s_add_i32 s2, s2, s8 +; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_add_i32 s2, s2, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2260,10 +2279,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2301,10 +2321,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 +; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 +; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2342,15 +2363,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 9db6d70..258bc295 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -717,11 +717,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 -; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 +; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -761,11 +762,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 +; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -803,12 +805,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -850,10 +853,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s6, s6, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -893,13 +897,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1164_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -944,10 +949,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s6, s6, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -987,14 +993,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1264_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 -; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s7 +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 ; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8 +; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1022,7 +1028,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xf1ff ; GFX1264_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null @@ -1036,15 +1041,15 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1232_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1 +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 -; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 ; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -2358,6 +2363,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -2410,6 +2416,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -2455,12 +2462,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s2 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s2 ; GFX1064_ITERATIVE-NEXT: s_add_u32 s6, s6, s3 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -2507,12 +2515,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s1 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s1 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s6, s6, s2 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -2560,13 +2569,14 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s2 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s6, s6, s3 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -2616,13 +2626,14 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s6, s6, s2 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -2666,16 +2677,16 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1264_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1] -; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8 -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8 -; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1] ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10 +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10 ; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9] ; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] +; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -2720,17 +2731,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 -; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8 ; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe +; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -4479,11 +4490,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0 -; GFX8_ITERATIVE-NEXT: s_add_i32 s12, s12, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0 +; GFX8_ITERATIVE-NEXT: s_add_i32 s12, s12, s6 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -4538,11 +4550,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s12, s12, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0 +; GFX9_ITERATIVE-NEXT: s_add_i32 s12, s12, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -4595,12 +4608,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s2 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1064_ITERATIVE-NEXT: s_add_i32 s12, s12, s6 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s6 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064_ITERATIVE-NEXT: s_add_i32 s12, s12, s7 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -4656,10 +4670,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s8, s1 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s8, s8, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -4713,13 +4728,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2 ; GFX1164_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s2 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164_ITERATIVE-NEXT: s_add_i32 s12, s12, s6 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s6 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164_ITERATIVE-NEXT: s_add_i32 s12, s12, s7 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -4783,10 +4799,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s8, s1 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s8, s8, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -4844,14 +4861,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1264_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 -; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s7 +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 ; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8 +; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -4879,7 +4896,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xf1ff ; GFX1264_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null @@ -4893,15 +4909,15 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1232_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1 +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 -; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 ; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -6657,6 +6673,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -6729,6 +6746,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -6794,12 +6812,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2 ; GFX1064_ITERATIVE-NEXT: s_add_u32 s8, s8, s3 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s9, s9, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -6864,12 +6883,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s1 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s8, s8, s2 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -6935,13 +6955,14 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s8, s8, s3 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s9, s9, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -7015,13 +7036,14 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s1 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s8, s8, s2 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -7087,16 +7109,16 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1264_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1] -; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8 -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8 -; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1] ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10 +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10 ; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9] ; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] +; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -7141,17 +7163,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 -; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8 ; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe +; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 6167a84..23c5f4f 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -499,11 +499,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -539,11 +540,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -578,10 +580,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -618,10 +621,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -659,10 +663,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -702,10 +707,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1082,10 +1088,11 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX8_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 -; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s4 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1110,10 +1117,11 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX9_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 -; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s4 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1139,8 +1147,9 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1167,8 +1176,9 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1196,8 +1206,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1227,8 +1239,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2008,6 +2022,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2056,6 +2071,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2096,12 +2112,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6 ; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2143,12 +2160,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2191,13 +2209,14 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -2242,13 +2261,14 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -2861,6 +2881,7 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2893,6 +2914,7 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2924,6 +2946,7 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2956,6 +2979,7 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2989,6 +3013,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3022,8 +3048,9 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s4 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3879,11 +3906,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3919,11 +3947,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3958,10 +3987,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3998,10 +4028,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4039,10 +4070,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -4082,10 +4114,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -4462,10 +4495,11 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX8_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 -; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s4 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4490,10 +4524,11 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX9_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 -; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s4 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4519,8 +4554,9 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4547,8 +4583,9 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4576,8 +4613,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4607,8 +4646,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5411,6 +5452,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5459,6 +5501,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5499,12 +5542,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6 ; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5546,12 +5590,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5594,13 +5639,14 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -5645,13 +5691,14 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -6266,11 +6313,12 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6306,11 +6354,12 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6345,10 +6394,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6385,10 +6435,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 ; GFX1032_ITERATIVE-NEXT: s_and_b32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6426,10 +6477,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -6469,10 +6521,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX1132_ITERATIVE-NEXT: s_and_b32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -6873,11 +6926,12 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX8_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6919,11 +6973,12 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX9_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6960,14 +7015,15 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 -; GFX1064_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX1064_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7009,11 +7065,12 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8 ; GFX1032_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7052,15 +7109,16 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 -; GFX1164_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] +; GFX1164_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -7105,11 +7163,12 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 ; GFX1132_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -7613,11 +7672,12 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7653,11 +7713,12 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7692,10 +7753,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7732,10 +7794,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 ; GFX1032_ITERATIVE-NEXT: s_or_b32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7773,10 +7836,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -7816,10 +7880,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX1132_ITERATIVE-NEXT: s_or_b32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -8219,11 +8284,12 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX8_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8265,11 +8331,12 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX9_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8306,14 +8373,15 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 -; GFX1064_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX1064_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8355,11 +8423,12 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8 ; GFX1032_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8398,15 +8467,16 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 -; GFX1164_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] +; GFX1164_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8451,11 +8521,12 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 ; GFX1132_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8959,11 +9030,12 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8999,11 +9071,12 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9038,10 +9111,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9078,10 +9152,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9119,10 +9194,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -9162,10 +9238,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX1132_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -9565,11 +9642,12 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9611,11 +9689,12 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9652,14 +9731,15 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9701,11 +9781,12 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8 ; GFX1032_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9744,15 +9825,16 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9797,11 +9879,12 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 ; GFX1132_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -10305,11 +10388,12 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10345,11 +10429,12 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10384,10 +10469,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10424,10 +10510,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 ; GFX1032_ITERATIVE-NEXT: s_max_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10465,10 +10552,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -10508,10 +10596,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX1132_ITERATIVE-NEXT: s_max_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -11166,6 +11255,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11221,6 +11311,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11272,6 +11363,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11323,6 +11415,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11375,8 +11468,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -11431,8 +11525,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -12119,11 +12214,12 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12159,11 +12255,12 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12198,10 +12295,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12238,10 +12336,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 ; GFX1032_ITERATIVE-NEXT: s_min_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12279,10 +12378,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -12322,10 +12422,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX1132_ITERATIVE-NEXT: s_min_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -12980,6 +13081,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13035,6 +13137,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13086,6 +13189,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13137,6 +13241,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13189,8 +13294,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -13245,8 +13351,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -13933,11 +14040,12 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13973,11 +14081,12 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14012,10 +14121,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14052,10 +14162,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 ; GFX1032_ITERATIVE-NEXT: s_max_u32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14093,10 +14204,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -14136,10 +14248,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX1132_ITERATIVE-NEXT: s_max_u32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -14788,6 +14901,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14842,6 +14956,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14892,6 +15007,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14942,6 +15058,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14995,6 +15112,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -15050,6 +15169,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -15732,11 +15853,12 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15772,11 +15894,12 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15811,10 +15934,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15851,10 +15975,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 ; GFX1032_ITERATIVE-NEXT: s_min_u32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15892,10 +16017,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -15935,10 +16061,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX1132_ITERATIVE-NEXT: s_min_u32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -16588,6 +16715,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16642,6 +16770,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16692,6 +16821,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16742,6 +16872,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16795,6 +16926,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -16850,6 +16983,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index 9afc0c6..e4def28 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -611,11 +611,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8-NEXT: s_add_i32 s2, s2, s8 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -651,11 +652,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9-NEXT: s_add_i32 s2, s2, s8 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -690,10 +692,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX10W64-NEXT: s_add_i32 s2, s2, s8 +; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_add_i32 s2, s2, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -729,10 +732,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 +; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -769,10 +773,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX11W64-NEXT: s_add_i32 s2, s2, s8 +; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_add_i32 s2, s2, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -812,10 +817,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -852,10 +858,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 +; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 +; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -893,15 +900,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1658,11 +1665,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8-NEXT: s_add_i32 s2, s2, s8 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1698,11 +1706,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9-NEXT: s_add_i32 s2, s2, s8 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1737,10 +1746,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX10W64-NEXT: s_add_i32 s2, s2, s8 +; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_add_i32 s2, s2, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1776,10 +1786,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 +; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1816,10 +1827,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX11W64-NEXT: s_add_i32 s2, s2, s8 +; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_add_i32 s2, s2, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1859,10 +1871,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1900,10 +1913,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 +; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 +; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1941,15 +1955,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB6_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index 10fd34f..39a3c9a 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -628,11 +628,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8-NEXT: s_add_i32 s2, s2, s8 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -669,11 +670,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9-NEXT: s_add_i32 s2, s2, s8 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -709,10 +711,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX10W64-NEXT: s_add_i32 s2, s2, s8 +; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_add_i32 s2, s2, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -749,10 +752,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 +; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -790,10 +794,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX11W64-NEXT: s_add_i32 s2, s2, s8 +; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_add_i32 s2, s2, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -834,10 +839,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -874,10 +880,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 +; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 +; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -916,15 +923,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1826,11 +1833,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8-NEXT: s_add_i32 s2, s2, s8 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1867,11 +1875,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9-NEXT: s_add_i32 s2, s2, s8 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1907,10 +1916,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX10W64-NEXT: s_add_i32 s2, s2, s8 +; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_add_i32 s2, s2, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1947,10 +1957,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 +; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1988,10 +1999,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX11W64-NEXT: s_add_i32 s2, s2, s8 +; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_add_i32 s2, s2, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2032,10 +2044,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2073,10 +2086,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 +; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 +; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2115,15 +2129,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index b96de17..4a6fa4f 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -704,6 +704,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; CISI-NEXT: s_add_u32 s4, s4, s6 ; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0 ; CISI-NEXT: s_or_b32 s6, s12, s13 +; CISI-NEXT: s_cmp_lg_u32 s6, 0 ; CISI-NEXT: s_addc_u32 s5, s5, s7 ; CISI-NEXT: s_mov_b32 s8, s0 ; CISI-NEXT: s_mov_b32 s9, s1 @@ -724,14 +725,16 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_add_u32 s2, s4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_add_u32 s0, s4, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_addc_u32 s1, s5, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 +; VI-NEXT: s_addc_u32 s0, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v5, s0 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -743,10 +746,12 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s12, s14 -; GFX9-NEXT: s_addc_u32 s1, s13, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_add_u32 s2, s12, s14 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_addc_u32 s0, s13, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -759,8 +764,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_add_u32 s0, s12, s14 -; GFX1010-NEXT: s_addc_u32 s1, s13, s15 +; GFX1010-NEXT: s_cselect_b32 s1, -1, 0 ; GFX1010-NEXT: v_mov_b32_e32 v0, s0 +; GFX1010-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1010-NEXT: s_addc_u32 s1, s13, s15 ; GFX1010-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -774,8 +781,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_add_u32 s4, s4, s6 -; GFX1030W32-NEXT: s_addc_u32 s5, s5, s7 +; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4 +; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0 +; GFX1030W32-NEXT: s_addc_u32 s5, s5, s7 ; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 @@ -789,8 +798,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_add_u32 s4, s4, s6 -; GFX1030W64-NEXT: s_addc_u32 s5, s5, s7 +; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX1030W64-NEXT: s_addc_u32 s5, s5, s7 ; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5 ; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] @@ -803,8 +814,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s4, s4, s6 -; GFX11-NEXT: s_addc_u32 s5, s5, s7 +; GFX11-NEXT: s_cselect_b32 s6, -1, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_cmp_lg_u32 s6, 0 +; GFX11-NEXT: s_addc_u32 s5, s5, s7 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 @@ -818,8 +831,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_add_co_u32 s0, s12, s14 -; GFX1250-NEXT: s_add_co_ci_u32 s1, s13, s15 +; GFX1250-NEXT: s_cselect_b32 s1, -1, 0 ; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1250-NEXT: s_add_co_ci_u32 s1, s13, s15 ; GFX1250-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -1676,6 +1691,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; CISI-NEXT: s_sub_u32 s4, s4, s6 ; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0 ; CISI-NEXT: s_or_b32 s6, s12, s13 +; CISI-NEXT: s_cmp_lg_u32 s6, 0 ; CISI-NEXT: s_subb_u32 s5, s5, s7 ; CISI-NEXT: s_mov_b32 s8, s0 ; CISI-NEXT: s_mov_b32 s9, s1 @@ -1696,14 +1712,16 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_sub_u32 s2, s4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_sub_u32 s0, s4, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_subb_u32 s1, s5, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 +; VI-NEXT: s_subb_u32 s0, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v5, s0 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -1715,10 +1733,12 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s0, s12, s14 -; GFX9-NEXT: s_subb_u32 s1, s13, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_sub_u32 s2, s12, s14 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_subb_u32 s0, s13, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -1731,8 +1751,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_sub_u32 s0, s12, s14 -; GFX1010-NEXT: s_subb_u32 s1, s13, s15 +; GFX1010-NEXT: s_cselect_b32 s1, -1, 0 ; GFX1010-NEXT: v_mov_b32_e32 v0, s0 +; GFX1010-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1010-NEXT: s_subb_u32 s1, s13, s15 ; GFX1010-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -1746,8 +1768,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_sub_u32 s4, s4, s6 -; GFX1030W32-NEXT: s_subb_u32 s5, s5, s7 +; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4 +; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0 +; GFX1030W32-NEXT: s_subb_u32 s5, s5, s7 ; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 @@ -1761,8 +1785,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_sub_u32 s4, s4, s6 -; GFX1030W64-NEXT: s_subb_u32 s5, s5, s7 +; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX1030W64-NEXT: s_subb_u32 s5, s5, s7 ; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5 ; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] @@ -1775,8 +1801,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s4, s4, s6 -; GFX11-NEXT: s_subb_u32 s5, s5, s7 +; GFX11-NEXT: s_cselect_b32 s6, -1, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_cmp_lg_u32 s6, 0 +; GFX11-NEXT: s_subb_u32 s5, s5, s7 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 @@ -1790,8 +1818,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_sub_co_u32 s0, s12, s14 -; GFX1250-NEXT: s_sub_co_ci_u32 s1, s13, s15 +; GFX1250-NEXT: s_cselect_b32 s1, -1, 0 ; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1250-NEXT: s_sub_co_ci_u32 s1, s13, s15 ; GFX1250-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -2188,46 +2218,49 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; VI-NEXT: s_addc_u32 s6, s7, s9 ; VI-NEXT: s_addc_u32 s8, s8, 0 ; VI-NEXT: v_readfirstlane_b32 s7, v0 -; VI-NEXT: s_add_u32 s10, s6, s7 -; VI-NEXT: v_mov_b32_e32 v0, s10 +; VI-NEXT: s_add_u32 s12, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s4, v0, 0 -; VI-NEXT: s_addc_u32 s11, 0, s8 -; VI-NEXT: s_mul_i32 s8, s4, s11 +; VI-NEXT: s_addc_u32 s13, 0, s8 +; VI-NEXT: s_mul_i32 s8, s4, s13 ; VI-NEXT: v_readfirstlane_b32 s9, v1 ; VI-NEXT: s_add_i32 s8, s9, s8 -; VI-NEXT: s_mul_i32 s9, s5, s10 -; VI-NEXT: s_add_i32 s12, s8, s9 -; VI-NEXT: s_sub_i32 s13, s3, s12 +; VI-NEXT: s_mul_i32 s9, s5, s12 +; VI-NEXT: s_add_i32 s14, s8, s9 +; VI-NEXT: s_sub_i32 s10, s3, s14 ; VI-NEXT: v_readfirstlane_b32 s8, v0 -; VI-NEXT: s_sub_u32 s14, s2, s8 +; VI-NEXT: s_sub_u32 s15, s2, s8 ; VI-NEXT: s_cselect_b64 s[8:9], -1, 0 -; VI-NEXT: s_subb_u32 s13, s13, s5 -; VI-NEXT: s_sub_u32 s15, s14, s4 -; VI-NEXT: s_subb_u32 s13, s13, 0 -; VI-NEXT: s_cmp_ge_u32 s13, s5 +; VI-NEXT: s_cmp_lg_u64 s[8:9], 0 +; VI-NEXT: s_subb_u32 s16, s10, s5 +; VI-NEXT: s_sub_u32 s17, s15, s4 +; VI-NEXT: s_cselect_b64 s[10:11], -1, 0 +; VI-NEXT: s_cmp_lg_u64 s[10:11], 0 +; VI-NEXT: s_subb_u32 s10, s16, 0 +; VI-NEXT: s_cmp_ge_u32 s10, s5 +; VI-NEXT: s_cselect_b32 s11, -1, 0 +; VI-NEXT: s_cmp_ge_u32 s17, s4 ; VI-NEXT: s_cselect_b32 s16, -1, 0 -; VI-NEXT: s_cmp_ge_u32 s15, s4 -; VI-NEXT: s_cselect_b32 s15, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s13, s5 -; VI-NEXT: s_cselect_b32 s13, s15, s16 -; VI-NEXT: s_add_u32 s15, s10, 1 -; VI-NEXT: s_addc_u32 s16, s11, 0 -; VI-NEXT: s_add_u32 s17, s10, 2 -; VI-NEXT: s_addc_u32 s18, s11, 0 -; VI-NEXT: s_cmp_lg_u32 s13, 0 -; VI-NEXT: s_cselect_b32 s13, s17, s15 -; VI-NEXT: s_cselect_b32 s15, s18, s16 +; VI-NEXT: s_cmp_eq_u32 s10, s5 +; VI-NEXT: s_cselect_b32 s10, s16, s11 +; VI-NEXT: s_add_u32 s11, s12, 1 +; VI-NEXT: s_addc_u32 s16, s13, 0 +; VI-NEXT: s_add_u32 s17, s12, 2 +; VI-NEXT: s_addc_u32 s18, s13, 0 +; VI-NEXT: s_cmp_lg_u32 s10, 0 +; VI-NEXT: s_cselect_b32 s10, s17, s11 +; VI-NEXT: s_cselect_b32 s11, s18, s16 ; VI-NEXT: s_cmp_lg_u64 s[8:9], 0 -; VI-NEXT: s_subb_u32 s3, s3, s12 +; VI-NEXT: s_subb_u32 s3, s3, s14 ; VI-NEXT: s_cmp_ge_u32 s3, s5 ; VI-NEXT: s_cselect_b32 s8, -1, 0 -; VI-NEXT: s_cmp_ge_u32 s14, s4 +; VI-NEXT: s_cmp_ge_u32 s15, s4 ; VI-NEXT: s_cselect_b32 s9, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s3, s5 ; VI-NEXT: s_cselect_b32 s3, s9, s8 ; VI-NEXT: s_cmp_lg_u32 s3, 0 -; VI-NEXT: s_cselect_b32 s9, s15, s11 -; VI-NEXT: s_cselect_b32 s8, s13, s10 +; VI-NEXT: s_cselect_b32 s9, s11, s13 +; VI-NEXT: s_cselect_b32 s8, s10, s12 ; VI-NEXT: s_cbranch_execnz .LBB16_4 ; VI-NEXT: .LBB16_2: ; VI-NEXT: v_cvt_f32_u32_e32 v0, s4 @@ -2278,8 +2311,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_u32 s8, 0, s6 -; GFX9-NEXT: s_subb_u32 s9, 0, s7 +; GFX9-NEXT: s_sub_u32 s10, 0, s6 +; GFX9-NEXT: s_subb_u32 s11, 0, s7 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2288,102 +2321,109 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX9-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s10, v1 -; GFX9-NEXT: v_readfirstlane_b32 s11, v0 -; GFX9-NEXT: s_mul_i32 s12, s8, s10 -; GFX9-NEXT: s_mul_hi_u32 s14, s8, s11 -; GFX9-NEXT: s_mul_i32 s13, s9, s11 -; GFX9-NEXT: s_add_i32 s12, s14, s12 -; GFX9-NEXT: s_add_i32 s12, s12, s13 -; GFX9-NEXT: s_mul_i32 s15, s8, s11 -; GFX9-NEXT: s_mul_i32 s14, s11, s12 -; GFX9-NEXT: s_mul_hi_u32 s16, s11, s15 -; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12 +; GFX9-NEXT: v_readfirstlane_b32 s12, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v0 +; GFX9-NEXT: s_mul_i32 s9, s10, s12 +; GFX9-NEXT: s_mul_hi_u32 s14, s10, s8 +; GFX9-NEXT: s_mul_i32 s13, s11, s8 +; GFX9-NEXT: s_add_i32 s9, s14, s9 +; GFX9-NEXT: s_add_i32 s9, s9, s13 +; GFX9-NEXT: s_mul_i32 s15, s10, s8 +; GFX9-NEXT: s_mul_i32 s14, s8, s9 +; GFX9-NEXT: s_mul_hi_u32 s16, s8, s15 +; GFX9-NEXT: s_mul_hi_u32 s13, s8, s9 ; GFX9-NEXT: s_add_u32 s14, s16, s14 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_mul_hi_u32 s17, s10, s15 -; GFX9-NEXT: s_mul_i32 s15, s10, s15 +; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15 +; GFX9-NEXT: s_mul_i32 s15, s12, s15 ; GFX9-NEXT: s_add_u32 s14, s14, s15 -; GFX9-NEXT: s_mul_hi_u32 s16, s10, s12 +; GFX9-NEXT: s_mul_hi_u32 s16, s12, s9 ; GFX9-NEXT: s_addc_u32 s13, s13, s17 ; GFX9-NEXT: s_addc_u32 s14, s16, 0 -; GFX9-NEXT: s_mul_i32 s12, s10, s12 -; GFX9-NEXT: s_add_u32 s12, s13, s12 +; GFX9-NEXT: s_mul_i32 s9, s12, s9 +; GFX9-NEXT: s_add_u32 s9, s13, s9 ; GFX9-NEXT: s_addc_u32 s13, 0, s14 -; GFX9-NEXT: s_add_u32 s11, s11, s12 -; GFX9-NEXT: s_addc_u32 s10, s10, s13 -; GFX9-NEXT: s_mul_i32 s12, s8, s10 -; GFX9-NEXT: s_mul_hi_u32 s13, s8, s11 -; GFX9-NEXT: s_add_i32 s12, s13, s12 -; GFX9-NEXT: s_mul_i32 s9, s9, s11 -; GFX9-NEXT: s_add_i32 s12, s12, s9 -; GFX9-NEXT: s_mul_i32 s8, s8, s11 -; GFX9-NEXT: s_mul_hi_u32 s13, s10, s8 -; GFX9-NEXT: s_mul_i32 s14, s10, s8 -; GFX9-NEXT: s_mul_i32 s16, s11, s12 -; GFX9-NEXT: s_mul_hi_u32 s8, s11, s8 -; GFX9-NEXT: s_mul_hi_u32 s15, s11, s12 -; GFX9-NEXT: s_add_u32 s8, s8, s16 +; GFX9-NEXT: s_add_u32 s14, s8, s9 +; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX9-NEXT: s_addc_u32 s12, s12, s13 +; GFX9-NEXT: s_mul_i32 s8, s10, s12 +; GFX9-NEXT: s_mul_hi_u32 s9, s10, s14 +; GFX9-NEXT: s_add_i32 s8, s9, s8 +; GFX9-NEXT: s_mul_i32 s11, s11, s14 +; GFX9-NEXT: s_add_i32 s8, s8, s11 +; GFX9-NEXT: s_mul_i32 s10, s10, s14 +; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10 +; GFX9-NEXT: s_mul_i32 s13, s12, s10 +; GFX9-NEXT: s_mul_i32 s16, s14, s8 +; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10 +; GFX9-NEXT: s_mul_hi_u32 s15, s14, s8 +; GFX9-NEXT: s_add_u32 s10, s10, s16 ; GFX9-NEXT: s_addc_u32 s15, 0, s15 -; GFX9-NEXT: s_add_u32 s8, s8, s14 -; GFX9-NEXT: s_mul_hi_u32 s9, s10, s12 -; GFX9-NEXT: s_addc_u32 s8, s15, s13 +; GFX9-NEXT: s_add_u32 s10, s10, s13 +; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8 +; GFX9-NEXT: s_addc_u32 s10, s15, s11 ; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: s_mul_i32 s12, s10, s12 -; GFX9-NEXT: s_add_u32 s8, s8, s12 +; GFX9-NEXT: s_mul_i32 s8, s12, s8 +; GFX9-NEXT: s_add_u32 s8, s10, s8 +; GFX9-NEXT: s_addc_u32 s10, 0, s9 +; GFX9-NEXT: s_add_u32 s11, s14, s8 +; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX9-NEXT: s_addc_u32 s8, s12, s10 +; GFX9-NEXT: s_mul_i32 s10, s2, s8 +; GFX9-NEXT: s_mul_hi_u32 s12, s2, s11 +; GFX9-NEXT: s_mul_hi_u32 s9, s2, s8 +; GFX9-NEXT: s_add_u32 s10, s12, s10 ; GFX9-NEXT: s_addc_u32 s9, 0, s9 -; GFX9-NEXT: s_add_u32 s8, s11, s8 -; GFX9-NEXT: s_addc_u32 s9, s10, s9 -; GFX9-NEXT: s_mul_i32 s11, s2, s9 -; GFX9-NEXT: s_mul_hi_u32 s12, s2, s8 -; GFX9-NEXT: s_mul_hi_u32 s10, s2, s9 -; GFX9-NEXT: s_add_u32 s11, s12, s11 -; GFX9-NEXT: s_addc_u32 s10, 0, s10 -; GFX9-NEXT: s_mul_hi_u32 s13, s3, s8 -; GFX9-NEXT: s_mul_i32 s8, s3, s8 -; GFX9-NEXT: s_add_u32 s8, s11, s8 -; GFX9-NEXT: s_mul_hi_u32 s12, s3, s9 -; GFX9-NEXT: s_addc_u32 s8, s10, s13 +; GFX9-NEXT: s_mul_hi_u32 s13, s3, s11 +; GFX9-NEXT: s_mul_i32 s11, s3, s11 +; GFX9-NEXT: s_add_u32 s10, s10, s11 +; GFX9-NEXT: s_mul_hi_u32 s12, s3, s8 +; GFX9-NEXT: s_addc_u32 s9, s9, s13 ; GFX9-NEXT: s_addc_u32 s10, s12, 0 -; GFX9-NEXT: s_mul_i32 s9, s3, s9 -; GFX9-NEXT: s_add_u32 s11, s8, s9 -; GFX9-NEXT: s_addc_u32 s10, 0, s10 -; GFX9-NEXT: s_mul_i32 s8, s6, s10 -; GFX9-NEXT: s_mul_hi_u32 s9, s6, s11 +; GFX9-NEXT: s_mul_i32 s8, s3, s8 +; GFX9-NEXT: s_add_u32 s12, s9, s8 +; GFX9-NEXT: s_addc_u32 s13, 0, s10 +; GFX9-NEXT: s_mul_i32 s8, s6, s13 +; GFX9-NEXT: s_mul_hi_u32 s9, s6, s12 ; GFX9-NEXT: s_add_i32 s8, s9, s8 -; GFX9-NEXT: s_mul_i32 s9, s7, s11 -; GFX9-NEXT: s_add_i32 s12, s8, s9 -; GFX9-NEXT: s_sub_i32 s13, s3, s12 -; GFX9-NEXT: s_mul_i32 s8, s6, s11 -; GFX9-NEXT: s_sub_u32 s14, s2, s8 +; GFX9-NEXT: s_mul_i32 s9, s7, s12 +; GFX9-NEXT: s_add_i32 s14, s8, s9 +; GFX9-NEXT: s_sub_i32 s10, s3, s14 +; GFX9-NEXT: s_mul_i32 s8, s6, s12 +; GFX9-NEXT: s_sub_u32 s15, s2, s8 ; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_subb_u32 s13, s13, s7 -; GFX9-NEXT: s_sub_u32 s15, s14, s6 -; GFX9-NEXT: s_subb_u32 s13, s13, 0 -; GFX9-NEXT: s_cmp_ge_u32 s13, s7 +; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX9-NEXT: s_subb_u32 s16, s10, s7 +; GFX9-NEXT: s_sub_u32 s17, s15, s6 +; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GFX9-NEXT: s_subb_u32 s10, s16, 0 +; GFX9-NEXT: s_cmp_ge_u32 s10, s7 +; GFX9-NEXT: s_cselect_b32 s11, -1, 0 +; GFX9-NEXT: s_cmp_ge_u32 s17, s6 ; GFX9-NEXT: s_cselect_b32 s16, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s15, s6 -; GFX9-NEXT: s_cselect_b32 s15, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s13, s7 -; GFX9-NEXT: s_cselect_b32 s13, s15, s16 -; GFX9-NEXT: s_add_u32 s15, s11, 1 -; GFX9-NEXT: s_addc_u32 s16, s10, 0 -; GFX9-NEXT: s_add_u32 s17, s11, 2 -; GFX9-NEXT: s_addc_u32 s18, s10, 0 -; GFX9-NEXT: s_cmp_lg_u32 s13, 0 -; GFX9-NEXT: s_cselect_b32 s13, s17, s15 -; GFX9-NEXT: s_cselect_b32 s15, s18, s16 +; GFX9-NEXT: s_cmp_eq_u32 s10, s7 +; GFX9-NEXT: s_cselect_b32 s10, s16, s11 +; GFX9-NEXT: s_add_u32 s11, s12, 1 +; GFX9-NEXT: s_addc_u32 s16, s13, 0 +; GFX9-NEXT: s_add_u32 s17, s12, 2 +; GFX9-NEXT: s_addc_u32 s18, s13, 0 +; GFX9-NEXT: s_cmp_lg_u32 s10, 0 +; GFX9-NEXT: s_cselect_b32 s10, s17, s11 +; GFX9-NEXT: s_cselect_b32 s11, s18, s16 ; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GFX9-NEXT: s_subb_u32 s3, s3, s12 +; GFX9-NEXT: s_subb_u32 s3, s3, s14 ; GFX9-NEXT: s_cmp_ge_u32 s3, s7 ; GFX9-NEXT: s_cselect_b32 s8, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s14, s6 +; GFX9-NEXT: s_cmp_ge_u32 s15, s6 ; GFX9-NEXT: s_cselect_b32 s9, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s3, s7 ; GFX9-NEXT: s_cselect_b32 s3, s9, s8 ; GFX9-NEXT: s_cmp_lg_u32 s3, 0 -; GFX9-NEXT: s_cselect_b32 s9, s15, s10 -; GFX9-NEXT: s_cselect_b32 s8, s13, s11 +; GFX9-NEXT: s_cselect_b32 s9, s11, s13 +; GFX9-NEXT: s_cselect_b32 s8, s10, s12 ; GFX9-NEXT: s_cbranch_execnz .LBB16_3 ; GFX9-NEXT: .LBB16_2: ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 @@ -2463,40 +2503,44 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_add_u32 s11, s12, s11 ; GFX1010-NEXT: s_addc_u32 s12, 0, s13 ; GFX1010-NEXT: s_add_u32 s8, s8, s11 +; GFX1010-NEXT: s_cselect_b32 s11, -1, 0 +; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s8 +; GFX1010-NEXT: s_cmp_lg_u32 s11, 0 +; GFX1010-NEXT: s_mul_i32 s11, s9, s8 ; GFX1010-NEXT: s_addc_u32 s5, s5, s12 -; GFX1010-NEXT: s_mul_hi_u32 s11, s9, s8 -; GFX1010-NEXT: s_mul_i32 s12, s9, s8 -; GFX1010-NEXT: s_mul_i32 s9, s9, s5 ; GFX1010-NEXT: s_mul_i32 s10, s10, s8 -; GFX1010-NEXT: s_add_i32 s9, s11, s9 -; GFX1010-NEXT: s_mul_i32 s11, s5, s12 +; GFX1010-NEXT: s_mul_i32 s9, s9, s5 +; GFX1010-NEXT: s_mul_hi_u32 s12, s8, s11 +; GFX1010-NEXT: s_add_i32 s9, s13, s9 +; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s11 ; GFX1010-NEXT: s_add_i32 s9, s9, s10 -; GFX1010-NEXT: s_mul_hi_u32 s10, s8, s12 +; GFX1010-NEXT: s_mul_i32 s10, s5, s11 ; GFX1010-NEXT: s_mul_i32 s15, s8, s9 ; GFX1010-NEXT: s_mul_hi_u32 s14, s8, s9 -; GFX1010-NEXT: s_add_u32 s10, s10, s15 -; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s12 +; GFX1010-NEXT: s_add_u32 s12, s12, s15 ; GFX1010-NEXT: s_addc_u32 s14, 0, s14 -; GFX1010-NEXT: s_mul_hi_u32 s12, s5, s9 -; GFX1010-NEXT: s_add_u32 s10, s10, s11 +; GFX1010-NEXT: s_mul_hi_u32 s11, s5, s9 +; GFX1010-NEXT: s_add_u32 s10, s12, s10 ; GFX1010-NEXT: s_mul_i32 s9, s5, s9 ; GFX1010-NEXT: s_addc_u32 s10, s14, s13 -; GFX1010-NEXT: s_addc_u32 s11, s12, 0 +; GFX1010-NEXT: s_addc_u32 s11, s11, 0 ; GFX1010-NEXT: s_add_u32 s9, s10, s9 ; GFX1010-NEXT: s_addc_u32 s10, 0, s11 ; GFX1010-NEXT: s_add_u32 s8, s8, s9 +; GFX1010-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s8 +; GFX1010-NEXT: s_cmp_lg_u32 s9, 0 +; GFX1010-NEXT: s_mul_hi_u32 s9, s3, s8 ; GFX1010-NEXT: s_addc_u32 s5, s5, s10 -; GFX1010-NEXT: s_mul_hi_u32 s9, s2, s8 -; GFX1010-NEXT: s_mul_i32 s12, s2, s5 -; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s5 -; GFX1010-NEXT: s_mul_hi_u32 s10, s3, s8 ; GFX1010-NEXT: s_mul_i32 s8, s3, s8 -; GFX1010-NEXT: s_add_u32 s9, s9, s12 -; GFX1010-NEXT: s_addc_u32 s11, 0, s11 +; GFX1010-NEXT: s_mul_i32 s12, s2, s5 +; GFX1010-NEXT: s_mul_hi_u32 s10, s2, s5 +; GFX1010-NEXT: s_add_u32 s11, s11, s12 +; GFX1010-NEXT: s_addc_u32 s10, 0, s10 ; GFX1010-NEXT: s_mul_hi_u32 s13, s3, s5 -; GFX1010-NEXT: s_add_u32 s8, s9, s8 +; GFX1010-NEXT: s_add_u32 s8, s11, s8 ; GFX1010-NEXT: s_mul_i32 s5, s3, s5 -; GFX1010-NEXT: s_addc_u32 s8, s11, s10 +; GFX1010-NEXT: s_addc_u32 s8, s10, s9 ; GFX1010-NEXT: s_addc_u32 s9, s13, 0 ; GFX1010-NEXT: s_add_u32 s5, s8, s5 ; GFX1010-NEXT: s_addc_u32 s8, 0, s9 @@ -2509,8 +2553,11 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_sub_i32 s11, s3, s9 ; GFX1010-NEXT: s_sub_u32 s10, s2, s10 ; GFX1010-NEXT: s_cselect_b32 s12, -1, 0 +; GFX1010-NEXT: s_cmp_lg_u32 s12, 0 ; GFX1010-NEXT: s_subb_u32 s11, s11, s7 ; GFX1010-NEXT: s_sub_u32 s13, s10, s6 +; GFX1010-NEXT: s_cselect_b32 s14, -1, 0 +; GFX1010-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1010-NEXT: s_subb_u32 s11, s11, 0 ; GFX1010-NEXT: s_cmp_ge_u32 s11, s7 ; GFX1010-NEXT: s_cselect_b32 s14, -1, 0 @@ -2616,40 +2663,44 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: s_add_u32 s11, s12, s11 ; GFX1030W32-NEXT: s_addc_u32 s12, 0, s13 ; GFX1030W32-NEXT: s_add_u32 s8, s8, s11 +; GFX1030W32-NEXT: s_cselect_b32 s11, -1, 0 +; GFX1030W32-NEXT: s_mul_hi_u32 s13, s9, s8 +; GFX1030W32-NEXT: s_cmp_lg_u32 s11, 0 +; GFX1030W32-NEXT: s_mul_i32 s11, s9, s8 ; GFX1030W32-NEXT: s_addc_u32 s7, s7, s12 -; GFX1030W32-NEXT: s_mul_hi_u32 s11, s9, s8 -; GFX1030W32-NEXT: s_mul_i32 s12, s9, s8 -; GFX1030W32-NEXT: s_mul_i32 s9, s9, s7 ; GFX1030W32-NEXT: s_mul_i32 s10, s10, s8 -; GFX1030W32-NEXT: s_add_i32 s9, s11, s9 -; GFX1030W32-NEXT: s_mul_i32 s11, s7, s12 +; GFX1030W32-NEXT: s_mul_i32 s9, s9, s7 +; GFX1030W32-NEXT: s_mul_hi_u32 s12, s8, s11 +; GFX1030W32-NEXT: s_add_i32 s9, s13, s9 +; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s11 ; GFX1030W32-NEXT: s_add_i32 s9, s9, s10 -; GFX1030W32-NEXT: s_mul_hi_u32 s10, s8, s12 +; GFX1030W32-NEXT: s_mul_i32 s10, s7, s11 ; GFX1030W32-NEXT: s_mul_i32 s15, s8, s9 ; GFX1030W32-NEXT: s_mul_hi_u32 s14, s8, s9 -; GFX1030W32-NEXT: s_add_u32 s10, s10, s15 -; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s12 +; GFX1030W32-NEXT: s_add_u32 s12, s12, s15 ; GFX1030W32-NEXT: s_addc_u32 s14, 0, s14 -; GFX1030W32-NEXT: s_mul_hi_u32 s12, s7, s9 -; GFX1030W32-NEXT: s_add_u32 s10, s10, s11 +; GFX1030W32-NEXT: s_mul_hi_u32 s11, s7, s9 +; GFX1030W32-NEXT: s_add_u32 s10, s12, s10 ; GFX1030W32-NEXT: s_mul_i32 s9, s7, s9 ; GFX1030W32-NEXT: s_addc_u32 s10, s14, s13 -; GFX1030W32-NEXT: s_addc_u32 s11, s12, 0 +; GFX1030W32-NEXT: s_addc_u32 s11, s11, 0 ; GFX1030W32-NEXT: s_add_u32 s9, s10, s9 ; GFX1030W32-NEXT: s_addc_u32 s10, 0, s11 ; GFX1030W32-NEXT: s_add_u32 s8, s8, s9 +; GFX1030W32-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s8 +; GFX1030W32-NEXT: s_cmp_lg_u32 s9, 0 +; GFX1030W32-NEXT: s_mul_hi_u32 s9, s3, s8 ; GFX1030W32-NEXT: s_addc_u32 s7, s7, s10 -; GFX1030W32-NEXT: s_mul_hi_u32 s9, s2, s8 -; GFX1030W32-NEXT: s_mul_i32 s12, s2, s7 -; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s7 -; GFX1030W32-NEXT: s_mul_hi_u32 s10, s3, s8 ; GFX1030W32-NEXT: s_mul_i32 s8, s3, s8 -; GFX1030W32-NEXT: s_add_u32 s9, s9, s12 -; GFX1030W32-NEXT: s_addc_u32 s11, 0, s11 +; GFX1030W32-NEXT: s_mul_i32 s12, s2, s7 +; GFX1030W32-NEXT: s_mul_hi_u32 s10, s2, s7 +; GFX1030W32-NEXT: s_add_u32 s11, s11, s12 +; GFX1030W32-NEXT: s_addc_u32 s10, 0, s10 ; GFX1030W32-NEXT: s_mul_hi_u32 s13, s3, s7 -; GFX1030W32-NEXT: s_add_u32 s8, s9, s8 +; GFX1030W32-NEXT: s_add_u32 s8, s11, s8 ; GFX1030W32-NEXT: s_mul_i32 s7, s3, s7 -; GFX1030W32-NEXT: s_addc_u32 s8, s11, s10 +; GFX1030W32-NEXT: s_addc_u32 s8, s10, s9 ; GFX1030W32-NEXT: s_addc_u32 s9, s13, 0 ; GFX1030W32-NEXT: s_add_u32 s7, s8, s7 ; GFX1030W32-NEXT: s_addc_u32 s8, 0, s9 @@ -2662,8 +2713,11 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: s_sub_i32 s11, s3, s9 ; GFX1030W32-NEXT: s_sub_u32 s10, s2, s10 ; GFX1030W32-NEXT: s_cselect_b32 s12, -1, 0 +; GFX1030W32-NEXT: s_cmp_lg_u32 s12, 0 ; GFX1030W32-NEXT: s_subb_u32 s11, s11, s5 ; GFX1030W32-NEXT: s_sub_u32 s13, s10, s4 +; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0 +; GFX1030W32-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1030W32-NEXT: s_subb_u32 s11, s11, 0 ; GFX1030W32-NEXT: s_cmp_ge_u32 s11, s5 ; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0 @@ -2736,8 +2790,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W64-NEXT: ; %bb.1: ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GFX1030W64-NEXT: s_sub_u32 s8, 0, s4 -; GFX1030W64-NEXT: s_subb_u32 s9, 0, s5 +; GFX1030W64-NEXT: s_sub_u32 s9, 0, s4 +; GFX1030W64-NEXT: s_subb_u32 s10, 0, s5 ; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0 ; GFX1030W64-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1030W64-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2746,102 +2800,109 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0 ; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v1 -; GFX1030W64-NEXT: v_readfirstlane_b32 s7, v0 -; GFX1030W64-NEXT: s_mul_i32 s10, s8, s6 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s8, s7 -; GFX1030W64-NEXT: s_mul_i32 s11, s9, s7 -; GFX1030W64-NEXT: s_add_i32 s10, s12, s10 -; GFX1030W64-NEXT: s_mul_i32 s13, s8, s7 -; GFX1030W64-NEXT: s_add_i32 s10, s10, s11 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s7, s13 -; GFX1030W64-NEXT: s_mul_i32 s15, s7, s10 -; GFX1030W64-NEXT: s_mul_hi_u32 s14, s6, s13 -; GFX1030W64-NEXT: s_mul_i32 s11, s6, s13 -; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s10 +; GFX1030W64-NEXT: v_readfirstlane_b32 s8, v1 +; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1030W64-NEXT: s_mul_i32 s7, s9, s8 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s9, s6 +; GFX1030W64-NEXT: s_mul_i32 s11, s10, s6 +; GFX1030W64-NEXT: s_add_i32 s7, s12, s7 +; GFX1030W64-NEXT: s_mul_i32 s13, s9, s6 +; GFX1030W64-NEXT: s_add_i32 s7, s7, s11 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s13 +; GFX1030W64-NEXT: s_mul_i32 s15, s6, s7 +; GFX1030W64-NEXT: s_mul_hi_u32 s14, s8, s13 +; GFX1030W64-NEXT: s_mul_i32 s11, s8, s13 +; GFX1030W64-NEXT: s_mul_hi_u32 s13, s6, s7 ; GFX1030W64-NEXT: s_add_u32 s12, s12, s15 ; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13 -; GFX1030W64-NEXT: s_mul_hi_u32 s16, s6, s10 +; GFX1030W64-NEXT: s_mul_hi_u32 s16, s8, s7 ; GFX1030W64-NEXT: s_add_u32 s11, s12, s11 -; GFX1030W64-NEXT: s_mul_i32 s10, s6, s10 +; GFX1030W64-NEXT: s_mul_i32 s7, s8, s7 ; GFX1030W64-NEXT: s_addc_u32 s11, s13, s14 ; GFX1030W64-NEXT: s_addc_u32 s12, s16, 0 -; GFX1030W64-NEXT: s_add_u32 s10, s11, s10 +; GFX1030W64-NEXT: s_add_u32 s7, s11, s7 ; GFX1030W64-NEXT: s_addc_u32 s11, 0, s12 -; GFX1030W64-NEXT: s_add_u32 s7, s7, s10 -; GFX1030W64-NEXT: s_addc_u32 s6, s6, s11 -; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s7 -; GFX1030W64-NEXT: s_mul_i32 s11, s8, s7 -; GFX1030W64-NEXT: s_mul_i32 s8, s8, s6 -; GFX1030W64-NEXT: s_mul_i32 s9, s9, s7 -; GFX1030W64-NEXT: s_add_i32 s8, s10, s8 -; GFX1030W64-NEXT: s_mul_i32 s10, s6, s11 -; GFX1030W64-NEXT: s_add_i32 s8, s8, s9 -; GFX1030W64-NEXT: s_mul_hi_u32 s9, s7, s11 -; GFX1030W64-NEXT: s_mul_i32 s14, s7, s8 -; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s8 -; GFX1030W64-NEXT: s_add_u32 s9, s9, s14 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s11 +; GFX1030W64-NEXT: s_add_u32 s12, s6, s7 +; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX1030W64-NEXT: s_mul_hi_u32 s13, s9, s12 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX1030W64-NEXT: s_mul_i32 s6, s9, s12 +; GFX1030W64-NEXT: s_addc_u32 s8, s8, s11 +; GFX1030W64-NEXT: s_mul_i32 s10, s10, s12 +; GFX1030W64-NEXT: s_mul_i32 s9, s9, s8 +; GFX1030W64-NEXT: s_mul_hi_u32 s7, s12, s6 +; GFX1030W64-NEXT: s_add_i32 s9, s13, s9 +; GFX1030W64-NEXT: s_mul_hi_u32 s11, s8, s6 +; GFX1030W64-NEXT: s_add_i32 s9, s9, s10 +; GFX1030W64-NEXT: s_mul_i32 s6, s8, s6 +; GFX1030W64-NEXT: s_mul_i32 s14, s12, s9 +; GFX1030W64-NEXT: s_mul_hi_u32 s13, s12, s9 +; GFX1030W64-NEXT: s_add_u32 s7, s7, s14 ; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13 -; GFX1030W64-NEXT: s_mul_hi_u32 s11, s6, s8 -; GFX1030W64-NEXT: s_add_u32 s9, s9, s10 -; GFX1030W64-NEXT: s_mul_i32 s8, s6, s8 -; GFX1030W64-NEXT: s_addc_u32 s9, s13, s12 -; GFX1030W64-NEXT: s_addc_u32 s10, s11, 0 -; GFX1030W64-NEXT: s_add_u32 s8, s9, s8 -; GFX1030W64-NEXT: s_addc_u32 s9, 0, s10 -; GFX1030W64-NEXT: s_add_u32 s7, s7, s8 -; GFX1030W64-NEXT: s_addc_u32 s6, s6, s9 -; GFX1030W64-NEXT: s_mul_hi_u32 s8, s2, s7 -; GFX1030W64-NEXT: s_mul_i32 s11, s2, s6 -; GFX1030W64-NEXT: s_mul_hi_u32 s10, s2, s6 -; GFX1030W64-NEXT: s_mul_hi_u32 s9, s3, s7 +; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s9 +; GFX1030W64-NEXT: s_add_u32 s6, s7, s6 +; GFX1030W64-NEXT: s_mul_i32 s9, s8, s9 +; GFX1030W64-NEXT: s_addc_u32 s6, s13, s11 +; GFX1030W64-NEXT: s_addc_u32 s7, s10, 0 +; GFX1030W64-NEXT: s_add_u32 s6, s6, s9 +; GFX1030W64-NEXT: s_addc_u32 s9, 0, s7 +; GFX1030W64-NEXT: s_add_u32 s10, s12, s6 +; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX1030W64-NEXT: s_mul_hi_u32 s11, s2, s10 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX1030W64-NEXT: s_mul_hi_u32 s6, s3, s10 +; GFX1030W64-NEXT: s_addc_u32 s7, s8, s9 +; GFX1030W64-NEXT: s_mul_i32 s8, s3, s10 +; GFX1030W64-NEXT: s_mul_i32 s10, s2, s7 +; GFX1030W64-NEXT: s_mul_hi_u32 s9, s2, s7 +; GFX1030W64-NEXT: s_add_u32 s10, s11, s10 +; GFX1030W64-NEXT: s_addc_u32 s9, 0, s9 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s7 +; GFX1030W64-NEXT: s_add_u32 s8, s10, s8 ; GFX1030W64-NEXT: s_mul_i32 s7, s3, s7 -; GFX1030W64-NEXT: s_add_u32 s8, s8, s11 -; GFX1030W64-NEXT: s_addc_u32 s10, 0, s10 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s6 -; GFX1030W64-NEXT: s_add_u32 s7, s8, s7 -; GFX1030W64-NEXT: s_mul_i32 s6, s3, s6 -; GFX1030W64-NEXT: s_addc_u32 s7, s10, s9 +; GFX1030W64-NEXT: s_addc_u32 s6, s9, s6 ; GFX1030W64-NEXT: s_addc_u32 s8, s12, 0 -; GFX1030W64-NEXT: s_add_u32 s10, s7, s6 +; GFX1030W64-NEXT: s_add_u32 s10, s6, s7 ; GFX1030W64-NEXT: s_addc_u32 s11, 0, s8 ; GFX1030W64-NEXT: s_mul_hi_u32 s6, s4, s10 ; GFX1030W64-NEXT: s_mul_i32 s7, s4, s11 ; GFX1030W64-NEXT: s_mul_i32 s8, s5, s10 ; GFX1030W64-NEXT: s_add_i32 s6, s6, s7 -; GFX1030W64-NEXT: s_add_i32 s8, s6, s8 +; GFX1030W64-NEXT: s_add_i32 s12, s6, s8 ; GFX1030W64-NEXT: s_mul_i32 s6, s4, s10 -; GFX1030W64-NEXT: s_sub_i32 s9, s3, s8 -; GFX1030W64-NEXT: s_sub_u32 s12, s2, s6 +; GFX1030W64-NEXT: s_sub_i32 s8, s3, s12 +; GFX1030W64-NEXT: s_sub_u32 s13, s2, s6 ; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX1030W64-NEXT: s_subb_u32 s9, s9, s5 -; GFX1030W64-NEXT: s_sub_u32 s13, s12, s4 -; GFX1030W64-NEXT: s_subb_u32 s9, s9, 0 -; GFX1030W64-NEXT: s_cmp_ge_u32 s9, s5 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX1030W64-NEXT: s_subb_u32 s14, s8, s5 +; GFX1030W64-NEXT: s_sub_u32 s15, s13, s4 +; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX1030W64-NEXT: s_subb_u32 s8, s14, 0 +; GFX1030W64-NEXT: s_cmp_ge_u32 s8, s5 +; GFX1030W64-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1030W64-NEXT: s_cmp_ge_u32 s15, s4 ; GFX1030W64-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4 -; GFX1030W64-NEXT: s_cselect_b32 s13, -1, 0 -; GFX1030W64-NEXT: s_cmp_eq_u32 s9, s5 -; GFX1030W64-NEXT: s_cselect_b32 s9, s13, s14 -; GFX1030W64-NEXT: s_add_u32 s13, s10, 1 +; GFX1030W64-NEXT: s_cmp_eq_u32 s8, s5 +; GFX1030W64-NEXT: s_cselect_b32 s8, s14, s9 +; GFX1030W64-NEXT: s_add_u32 s9, s10, 1 ; GFX1030W64-NEXT: s_addc_u32 s14, s11, 0 ; GFX1030W64-NEXT: s_add_u32 s15, s10, 2 ; GFX1030W64-NEXT: s_addc_u32 s16, s11, 0 -; GFX1030W64-NEXT: s_cmp_lg_u32 s9, 0 -; GFX1030W64-NEXT: s_cselect_b32 s13, s15, s13 +; GFX1030W64-NEXT: s_cmp_lg_u32 s8, 0 +; GFX1030W64-NEXT: s_cselect_b32 s15, s15, s9 ; GFX1030W64-NEXT: s_cselect_b32 s14, s16, s14 ; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1030W64-NEXT: s_subb_u32 s3, s3, s8 +; GFX1030W64-NEXT: s_subb_u32 s3, s3, s12 ; GFX1030W64-NEXT: s_cmp_ge_u32 s3, s5 ; GFX1030W64-NEXT: s_cselect_b32 s6, -1, 0 -; GFX1030W64-NEXT: s_cmp_ge_u32 s12, s4 +; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4 ; GFX1030W64-NEXT: s_cselect_b32 s7, -1, 0 ; GFX1030W64-NEXT: s_cmp_eq_u32 s3, s5 ; GFX1030W64-NEXT: s_cselect_b32 s3, s7, s6 ; GFX1030W64-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1030W64-NEXT: s_cselect_b32 s7, s14, s11 -; GFX1030W64-NEXT: s_cselect_b32 s6, s13, s10 +; GFX1030W64-NEXT: s_cselect_b32 s6, s15, s10 ; GFX1030W64-NEXT: s_cbranch_execnz .LBB16_3 ; GFX1030W64-NEXT: .LBB16_2: ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4 @@ -2927,40 +2988,44 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_add_u32 s11, s12, s11 ; GFX11-NEXT: s_addc_u32 s12, 0, s13 ; GFX11-NEXT: s_add_u32 s8, s8, s11 +; GFX11-NEXT: s_cselect_b32 s11, -1, 0 +; GFX11-NEXT: s_mul_hi_u32 s13, s9, s8 +; GFX11-NEXT: s_cmp_lg_u32 s11, 0 +; GFX11-NEXT: s_mul_i32 s11, s9, s8 ; GFX11-NEXT: s_addc_u32 s7, s7, s12 -; GFX11-NEXT: s_mul_hi_u32 s11, s9, s8 -; GFX11-NEXT: s_mul_i32 s12, s9, s8 -; GFX11-NEXT: s_mul_i32 s9, s9, s7 ; GFX11-NEXT: s_mul_i32 s10, s10, s8 -; GFX11-NEXT: s_add_i32 s9, s11, s9 -; GFX11-NEXT: s_mul_i32 s11, s7, s12 +; GFX11-NEXT: s_mul_i32 s9, s9, s7 +; GFX11-NEXT: s_mul_hi_u32 s12, s8, s11 +; GFX11-NEXT: s_add_i32 s9, s13, s9 +; GFX11-NEXT: s_mul_hi_u32 s13, s7, s11 ; GFX11-NEXT: s_add_i32 s9, s9, s10 -; GFX11-NEXT: s_mul_hi_u32 s10, s8, s12 +; GFX11-NEXT: s_mul_i32 s10, s7, s11 ; GFX11-NEXT: s_mul_i32 s15, s8, s9 ; GFX11-NEXT: s_mul_hi_u32 s14, s8, s9 -; GFX11-NEXT: s_add_u32 s10, s10, s15 -; GFX11-NEXT: s_mul_hi_u32 s13, s7, s12 +; GFX11-NEXT: s_add_u32 s12, s12, s15 ; GFX11-NEXT: s_addc_u32 s14, 0, s14 -; GFX11-NEXT: s_mul_hi_u32 s12, s7, s9 -; GFX11-NEXT: s_add_u32 s10, s10, s11 +; GFX11-NEXT: s_mul_hi_u32 s11, s7, s9 +; GFX11-NEXT: s_add_u32 s10, s12, s10 ; GFX11-NEXT: s_mul_i32 s9, s7, s9 ; GFX11-NEXT: s_addc_u32 s10, s14, s13 -; GFX11-NEXT: s_addc_u32 s11, s12, 0 +; GFX11-NEXT: s_addc_u32 s11, s11, 0 ; GFX11-NEXT: s_add_u32 s9, s10, s9 ; GFX11-NEXT: s_addc_u32 s10, 0, s11 ; GFX11-NEXT: s_add_u32 s8, s8, s9 +; GFX11-NEXT: s_cselect_b32 s9, -1, 0 +; GFX11-NEXT: s_mul_hi_u32 s11, s2, s8 +; GFX11-NEXT: s_cmp_lg_u32 s9, 0 +; GFX11-NEXT: s_mul_hi_u32 s9, s3, s8 ; GFX11-NEXT: s_addc_u32 s7, s7, s10 -; GFX11-NEXT: s_mul_hi_u32 s9, s2, s8 -; GFX11-NEXT: s_mul_i32 s12, s2, s7 -; GFX11-NEXT: s_mul_hi_u32 s11, s2, s7 -; GFX11-NEXT: s_mul_hi_u32 s10, s3, s8 ; GFX11-NEXT: s_mul_i32 s8, s3, s8 -; GFX11-NEXT: s_add_u32 s9, s9, s12 -; GFX11-NEXT: s_addc_u32 s11, 0, s11 +; GFX11-NEXT: s_mul_i32 s12, s2, s7 +; GFX11-NEXT: s_mul_hi_u32 s10, s2, s7 +; GFX11-NEXT: s_add_u32 s11, s11, s12 +; GFX11-NEXT: s_addc_u32 s10, 0, s10 ; GFX11-NEXT: s_mul_hi_u32 s13, s3, s7 -; GFX11-NEXT: s_add_u32 s8, s9, s8 +; GFX11-NEXT: s_add_u32 s8, s11, s8 ; GFX11-NEXT: s_mul_i32 s7, s3, s7 -; GFX11-NEXT: s_addc_u32 s8, s11, s10 +; GFX11-NEXT: s_addc_u32 s8, s10, s9 ; GFX11-NEXT: s_addc_u32 s9, s13, 0 ; GFX11-NEXT: s_add_u32 s7, s8, s7 ; GFX11-NEXT: s_addc_u32 s8, 0, s9 @@ -2970,14 +3035,17 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_add_i32 s9, s9, s10 ; GFX11-NEXT: s_mul_i32 s10, s4, s7 ; GFX11-NEXT: s_add_i32 s9, s9, s11 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_sub_i32 s11, s3, s9 ; GFX11-NEXT: s_sub_u32 s10, s2, s10 ; GFX11-NEXT: s_cselect_b32 s12, -1, 0 +; GFX11-NEXT: s_cmp_lg_u32 s12, 0 ; GFX11-NEXT: s_subb_u32 s11, s11, s5 ; GFX11-NEXT: s_sub_u32 s13, s10, s4 +; GFX11-NEXT: s_cselect_b32 s14, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_cmp_lg_u32 s14, 0 ; GFX11-NEXT: s_subb_u32 s11, s11, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_cmp_ge_u32 s11, s5 ; GFX11-NEXT: s_cselect_b32 s14, -1, 0 ; GFX11-NEXT: s_cmp_ge_u32 s13, s4 @@ -3050,8 +3118,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_or_b64 s[4:5], s[2:3], s[6:7] -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_b64 s[4:5], s[4:5], 0xffffffff00000000 +; GFX1250-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1250-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1250-NEXT: ; %bb.1: ; GFX1250-NEXT: s_cvt_f32_u32 s4, s6 @@ -3086,9 +3155,12 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[4:5], s[12:13] ; GFX1250-NEXT: s_add_co_u32 s8, s8, s12 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1250-NEXT: s_add_co_ci_u32 s9, s9, s13 -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_mul_u64 s[10:11], s[10:11], s[8:9] +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_mul_hi_u32 s13, s8, s11 ; GFX1250-NEXT: s_mul_i32 s12, s8, s11 ; GFX1250-NEXT: s_mul_hi_u32 s4, s8, s10 @@ -3103,17 +3175,19 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_add_nc_u64 s[10:11], s[4:5], s[10:11] ; GFX1250-NEXT: s_add_co_u32 s8, s8, s10 -; GFX1250-NEXT: s_add_co_ci_u32 s10, s9, s11 +; GFX1250-NEXT: s_cselect_b32 s10, -1, 0 ; GFX1250-NEXT: s_mul_hi_u32 s4, s2, s8 -; GFX1250-NEXT: s_mul_hi_u32 s11, s3, s8 -; GFX1250-NEXT: s_mul_i32 s12, s3, s8 +; GFX1250-NEXT: s_cmp_lg_u32 s10, 0 +; GFX1250-NEXT: s_mul_hi_u32 s12, s3, s8 +; GFX1250-NEXT: s_add_co_ci_u32 s10, s9, s11 +; GFX1250-NEXT: s_mul_i32 s11, s3, s8 ; GFX1250-NEXT: s_mul_hi_u32 s9, s2, s10 ; GFX1250-NEXT: s_mul_i32 s8, s2, s10 ; GFX1250-NEXT: s_mul_hi_u32 s13, s3, s10 ; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[8:9] ; GFX1250-NEXT: s_mul_i32 s10, s3, s10 -; GFX1250-NEXT: s_add_co_u32 s4, s8, s12 -; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s11 +; GFX1250-NEXT: s_add_co_u32 s4, s8, s11 +; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s12 ; GFX1250-NEXT: s_add_co_ci_u32 s11, s13, 0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[10:11] @@ -3128,8 +3202,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_cmp_lg_u32 s8, 0 ; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, s7 ; GFX1250-NEXT: s_sub_co_u32 s13, s4, s6 +; GFX1250-NEXT: s_cselect_b32 s14, -1, 0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, 0 -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_cmp_ge_u32 s12, s7 ; GFX1250-NEXT: s_cselect_b32 s14, -1, 0 ; GFX1250-NEXT: s_cmp_ge_u32 s13, s6 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index 07e6a76..4b151b9 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -714,8 +714,9 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; VI-NEXT: s_lshl_b32 s2, s2, 8 ; VI-NEXT: s_or_b32 s2, s2, s3 ; VI-NEXT: s_lshl_b32 s3, s2, 16 -; VI-NEXT: s_flbit_i32_b32 s3, s3 ; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_flbit_i32_b32 s3, s3 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, s3, 32 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll index fca57be..cefcbdd 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -1491,6 +1491,7 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace( ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: s_cbranch_scc0 .LBB14_4 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_mov_b32 s11, 0xf000 @@ -1520,6 +1521,7 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace( ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s6, 16 +; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: s_cbranch_scc0 .LBB14_4 ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: s_mov_b32 s11, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll index dbdea8e..d8a5e7fa 100644 --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -14,6 +14,7 @@ define i32 @s_add_co_select_user() { ; GFX7-NEXT: s_add_u32 s7, s6, s6 ; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX7-NEXT: s_or_b32 s4, s4, s5 +; GFX7-NEXT: s_cmp_lg_u32 s4, 0 ; GFX7-NEXT: s_addc_u32 s8, s6, 0 ; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec @@ -30,6 +31,8 @@ define i32 @s_add_co_select_user() { ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s7, s6, s6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9-NEXT: s_addc_u32 s8, s6, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec @@ -46,6 +49,8 @@ define i32 @s_add_co_select_user() { ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s5, s4, s4 +; GFX10-NEXT: s_cselect_b32 s6, -1, 0 +; GFX10-NEXT: s_cmp_lg_u32 s6, 0 ; GFX10-NEXT: s_addc_u32 s6, s4, 0 ; GFX10-NEXT: s_cselect_b32 s7, -1, 0 ; GFX10-NEXT: s_and_b32 s7, s7, exec_lo @@ -62,13 +67,16 @@ define i32 @s_add_co_select_user() { ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s1, s0, s0 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_addc_u32 s2, s0, 0 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s3, s3, exec_lo ; GFX11-NEXT: s_cselect_b32 s2, s2, 0 ; GFX11-NEXT: s_cmp_gt_u32 s0, 31 ; GFX11-NEXT: s_cselect_b32 s0, s1, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] bb: @@ -96,6 +104,7 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX7-NEXT: s_add_u32 s0, s2, s2 ; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: s_cmp_lg_u32 s0, 0 ; GFX7-NEXT: s_addc_u32 s0, s2, 0 ; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX7-NEXT: s_andn2_b64 vcc, exec, s[0:1] @@ -116,10 +125,12 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; ; GFX9-LABEL: s_add_co_br_user: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s1, s0, s0 -; GFX9-NEXT: s_addc_u32 s0, s0, 0 +; GFX9-NEXT: s_add_u32 s0, s2, s2 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_addc_u32 s0, s2, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GFX9-NEXT: s_cbranch_vccnz .LBB1_2 @@ -142,6 +153,8 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s1, s0, s0 +; GFX10-NEXT: s_cselect_b32 s1, -1, 0 +; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: s_addc_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s0, -1, 0 ; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s0 @@ -165,9 +178,11 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s1, s0, s0 +; GFX11-NEXT: s_cselect_b32 s1, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-NEXT: s_addc_u32 s0, s0, 0 ; GFX11-NEXT: s_cselect_b32 s0, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_vccnz .LBB1_2 ; GFX11-NEXT: ; %bb.1: ; %bb0 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index 9a17538..62847b1 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -1117,6 +1117,7 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; SI: ; %bb.0: ; SI-NEXT: s_and_b32 s3, s1, 0x1ff ; SI-NEXT: s_or_b32 s0, s3, s0 +; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; SI-NEXT: s_lshr_b32 s0, s1, 8 @@ -1168,6 +1169,7 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; VI: ; %bb.0: ; VI-NEXT: s_and_b32 s3, s1, 0x1ff ; VI-NEXT: s_or_b32 s0, s3, s0 +; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; VI-NEXT: s_lshr_b32 s0, s1, 8 @@ -1215,6 +1217,7 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s3, s1, 0x1ff ; GFX9-NEXT: s_or_b32 s0, s3, s0 +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX9-NEXT: s_lshr_b32 s0, s1, 8 @@ -1261,9 +1264,11 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_and_b32 s3, s1, 0x1ff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_or_b32 s0, s3, s0 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-TRUE16-NEXT: s_cselect_b32 s0, -1, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX11-TRUE16-NEXT: s_bfe_u32 s0, s1, 0xb0014 ; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 8 @@ -1315,9 +1320,11 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_and_b32 s3, s1, 0x1ff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_or_b32 s0, s3, s0 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-FAKE16-NEXT: s_cselect_b32 s0, -1, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX11-FAKE16-NEXT: s_bfe_u32 s0, s1, 0xb0014 ; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 8 @@ -4016,6 +4023,7 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; SI-NEXT: s_and_b32 s6, s4, 0xffe ; SI-NEXT: s_and_b32 s4, s1, 0x1ff ; SI-NEXT: s_or_b32 s0, s4, s0 +; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s5 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] @@ -4058,6 +4066,7 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; SI-NEXT: s_and_b32 s5, s0, 0xffe ; SI-NEXT: s_and_b32 s0, s3, 0x1ff ; SI-NEXT: s_or_b32 s0, s0, s2 +; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; SI-NEXT: v_readfirstlane_b32 s0, v2 @@ -4111,9 +4120,10 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; VI-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_lshr_b32 s5, s3, 8 -; VI-NEXT: s_and_b32 s5, s5, 0xffe ; VI-NEXT: s_and_b32 s6, s3, 0x1ff +; VI-NEXT: s_and_b32 s5, s5, 0xffe ; VI-NEXT: s_or_b32 s2, s6, s2 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cselect_b64 s[6:7], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7] ; VI-NEXT: s_bfe_u32 s3, s3, 0xb0014 @@ -4153,6 +4163,7 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; VI-NEXT: s_and_b32 s7, s2, 0xffe ; VI-NEXT: s_and_b32 s2, s1, 0x1ff ; VI-NEXT: s_or_b32 s0, s2, s0 +; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; VI-NEXT: s_bfe_u32 s1, s1, 0xb0014 @@ -4198,9 +4209,10 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; GFX9-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s5, s3, 8 -; GFX9-NEXT: s_and_b32 s5, s5, 0xffe ; GFX9-NEXT: s_and_b32 s6, s3, 0x1ff +; GFX9-NEXT: s_and_b32 s5, s5, 0xffe ; GFX9-NEXT: s_or_b32 s2, s6, s2 +; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7] ; GFX9-NEXT: s_bfe_u32 s6, s3, 0xb0014 @@ -4242,6 +4254,7 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; GFX9-NEXT: s_and_b32 s6, s2, 0xffe ; GFX9-NEXT: s_and_b32 s2, s1, 0x1ff ; GFX9-NEXT: s_or_b32 s0, s2, s0 +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 @@ -4288,10 +4301,11 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; ; GFX11-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s5, s3, 8 -; GFX11-NEXT: s_and_b32 s6, s3, 0x1ff -; GFX11-NEXT: s_and_b32 s5, s5, 0xffe -; GFX11-NEXT: s_or_b32 s2, s6, s2 +; GFX11-NEXT: s_and_b32 s5, s3, 0x1ff +; GFX11-NEXT: s_lshr_b32 s6, s3, 8 +; GFX11-NEXT: s_or_b32 s2, s5, s2 +; GFX11-NEXT: s_and_b32 s5, s6, 0xffe +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 @@ -4334,12 +4348,13 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x40f ; GFX11-NEXT: s_cselect_b32 s2, s5, s6 ; GFX11-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-NEXT: s_and_b32 s6, s1, 0x1ff ; GFX11-NEXT: s_lshr_b32 s5, s1, 8 ; GFX11-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX11-NEXT: s_and_b32 s6, s1, 0x1ff +; GFX11-NEXT: s_or_b32 s0, s6, s0 ; GFX11-NEXT: s_and_b32 s5, s5, 0xffe ; GFX11-NEXT: s_or_b32 s2, s3, s2 -; GFX11-NEXT: s_or_b32 s0, s6, s0 +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-NEXT: s_cselect_b32 s0, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll index c28b25c7..b0dd187 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -599,8 +599,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; SI-GISEL-NEXT: s_and_b32 s6, s6, 0xffe ; SI-GISEL-NEXT: s_or_b32 s4, s7, s4 +; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; SI-GISEL-NEXT: s_or_b32 s4, s6, s4 +; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0 ; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9 ; SI-GISEL-NEXT: s_lshl_b32 s7, s3, 12 @@ -709,8 +711,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe ; VI-GISEL-NEXT: s_or_b32 s2, s6, s2 +; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; VI-GISEL-NEXT: s_or_b32 s2, s5, s2 +; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4 ; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12 @@ -820,8 +824,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX9-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffe ; GFX9-GISEL-NEXT: s_or_b32 s2, s6, s2 +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; GFX9-GISEL-NEXT: s_or_b32 s2, s5, s2 +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX9-GISEL-NEXT: s_sub_i32 s7, 1, s4 ; GFX9-GISEL-NEXT: s_lshl_b32 s6, s4, 12 @@ -931,8 +937,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX950-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; GFX950-GISEL-NEXT: s_and_b32 s5, s5, 0xffe ; GFX950-GISEL-NEXT: s_or_b32 s2, s6, s2 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; GFX950-GISEL-NEXT: s_or_b32 s2, s5, s2 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX950-GISEL-NEXT: s_sub_i32 s7, 1, s4 ; GFX950-GISEL-NEXT: s_lshl_b32 s6, s4, 12 @@ -1110,15 +1118,17 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8 -; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2 ; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe -; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000 @@ -1165,15 +1175,17 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8 -; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2 ; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe -; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000 @@ -1354,15 +1366,17 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8 -; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff +; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2 ; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s4, 0xfc10 ; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe -; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2 +; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0 -; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2 +; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s6, 1, s4 ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000 @@ -1409,15 +1423,17 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8 -; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff +; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2 ; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s4, 0xfc10 ; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe -; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2 +; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0 -; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2 +; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s6, 1, s4 ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000 @@ -2138,8 +2154,10 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe ; SI-GISEL-NEXT: s_or_b32 s4, s9, s4 +; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; SI-GISEL-NEXT: s_or_b32 s4, s8, s4 +; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0 ; SI-GISEL-NEXT: s_lshl_b32 s8, s8, 9 ; SI-GISEL-NEXT: s_lshl_b32 s9, s3, 12 @@ -2175,10 +2193,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; SI-GISEL-NEXT: s_and_b32 s4, s4, 0x8000 ; SI-GISEL-NEXT: s_addk_i32 s5, 0xfc10 ; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe +; SI-GISEL-NEXT: s_or_b32 s6, s9, s6 ; SI-GISEL-NEXT: s_or_b32 s3, s4, s3 -; SI-GISEL-NEXT: s_or_b32 s4, s9, s6 +; SI-GISEL-NEXT: s_cmp_lg_u32 s6, 0 ; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; SI-GISEL-NEXT: s_or_b32 s4, s8, s4 +; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0 ; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9 ; SI-GISEL-NEXT: s_lshl_b32 s8, s5, 12 @@ -2335,8 +2355,10 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; VI-GISEL-NEXT: s_addk_i32 s2, 0xfc10 ; VI-GISEL-NEXT: s_and_b32 s3, s3, 0xffe ; VI-GISEL-NEXT: s_or_b32 s4, s8, s4 +; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; VI-GISEL-NEXT: s_or_b32 s3, s3, s4 +; VI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; VI-GISEL-NEXT: s_sub_i32 s9, 1, s2 ; VI-GISEL-NEXT: s_lshl_b32 s8, s2, 12 @@ -2370,12 +2392,14 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; VI-GISEL-NEXT: s_or_b32 s2, s3, s2 ; VI-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014 ; VI-GISEL-NEXT: s_lshr_b32 s4, s7, 8 +; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; VI-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; VI-GISEL-NEXT: s_and_b32 s4, s4, 0xffe -; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; VI-GISEL-NEXT: s_or_b32 s5, s5, s6 +; VI-GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; VI-GISEL-NEXT: s_or_b32 s4, s4, s5 +; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; VI-GISEL-NEXT: s_sub_i32 s8, 1, s3 ; VI-GISEL-NEXT: s_lshl_b32 s6, s3, 12 @@ -2531,8 +2555,10 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX9-GISEL-NEXT: s_addk_i32 s2, 0xfc10 ; GFX9-GISEL-NEXT: s_and_b32 s3, s3, 0xffe ; GFX9-GISEL-NEXT: s_or_b32 s4, s8, s4 +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-GISEL-NEXT: s_or_b32 s3, s3, s4 +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-GISEL-NEXT: s_sub_i32 s9, 1, s2 ; GFX9-GISEL-NEXT: s_lshl_b32 s8, s2, 12 @@ -2566,12 +2592,14 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2 ; GFX9-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014 ; GFX9-GISEL-NEXT: s_lshr_b32 s4, s7, 8 +; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; GFX9-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; GFX9-GISEL-NEXT: s_and_b32 s4, s4, 0xffe -; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; GFX9-GISEL-NEXT: s_or_b32 s5, s5, s6 +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX9-GISEL-NEXT: s_or_b32 s4, s4, s5 +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX9-GISEL-NEXT: s_sub_i32 s8, 1, s3 ; GFX9-GISEL-NEXT: s_lshl_b32 s6, s3, 12 @@ -2724,8 +2752,10 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX950-GISEL-NEXT: s_addk_i32 s2, 0xfc10 ; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0xffe ; GFX950-GISEL-NEXT: s_or_b32 s4, s8, s4 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GFX950-GISEL-NEXT: s_or_b32 s3, s3, s4 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GFX950-GISEL-NEXT: s_sub_i32 s9, 1, s2 ; GFX950-GISEL-NEXT: s_lshl_b32 s8, s2, 12 @@ -2759,12 +2789,14 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX950-GISEL-NEXT: s_or_b32 s2, s3, s2 ; GFX950-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014 ; GFX950-GISEL-NEXT: s_lshr_b32 s4, s7, 8 +; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; GFX950-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0xffe -; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; GFX950-GISEL-NEXT: s_or_b32 s5, s5, s6 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX950-GISEL-NEXT: s_or_b32 s4, s4, s5 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX950-GISEL-NEXT: s_sub_i32 s8, 1, s3 ; GFX950-GISEL-NEXT: s_lshl_b32 s6, s3, 12 @@ -3041,15 +3073,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014 ; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8 -; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4 ; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s2, 0xfc10 ; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe -; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 ; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s8, 1, s2 ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000 @@ -3081,17 +3115,19 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-GISEL-TRUE16-NEXT: s_cmpk_eq_i32 s2, 0x40f ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3 ; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16 +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014 ; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8 ; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6 ; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2 -; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s8, s6 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000 @@ -3140,15 +3176,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014 ; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8 -; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4 ; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s2, 0xfc10 ; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe -; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 ; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s8, 1, s2 ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000 @@ -3180,17 +3218,19 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-GISEL-FAKE16-NEXT: s_cmpk_eq_i32 s2, 0x40f ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3 ; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16 +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014 ; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8 ; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6 ; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2 -; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s8, s6 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000 @@ -3471,15 +3511,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014 ; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8 -; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff +; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4 ; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s2, 0xfc10 ; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe -; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4 +; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 -; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s8, 1, s2 ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000 @@ -3511,17 +3553,19 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX1250-GISEL-TRUE16-NEXT: s_cmp_eq_u32 s2, 0x40f ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3 ; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16 +; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014 ; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8 ; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff +; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6 ; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s4, 0xfc10 ; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2 -; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s8, s6 +; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0 -; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3 +; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s6, 1, s4 ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000 @@ -3570,15 +3614,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014 ; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8 -; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff +; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4 ; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s2, 0xfc10 ; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe -; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4 +; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 -; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s8, 1, s2 ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000 @@ -3610,17 +3656,19 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX1250-GISEL-FAKE16-NEXT: s_cmp_eq_u32 s2, 0x40f ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3 ; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16 +; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014 ; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8 ; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff +; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6 ; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s4, 0xfc10 ; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2 -; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s8, s6 +; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0 -; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3 +; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s6, 1, s4 ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000 diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll index 0deef8b..5d31177 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll @@ -182,6 +182,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; SI-NEXT: s_and_b32 s1, s7, 0x1ff ; SI-NEXT: s_and_b32 s8, s0, 0xffe ; SI-NEXT: s_or_b32 s0, s1, s6 +; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014 @@ -236,6 +237,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; VI-SDAG-NEXT: s_and_b32 s8, s4, 0xffe ; VI-SDAG-NEXT: s_and_b32 s4, s7, 0x1ff ; VI-SDAG-NEXT: s_or_b32 s4, s4, s6 +; VI-SDAG-NEXT: s_cmp_lg_u32 s4, 0 ; VI-SDAG-NEXT: s_mov_b32 s1, s5 ; VI-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0 ; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] @@ -288,8 +290,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe ; VI-GISEL-NEXT: s_or_b32 s2, s6, s2 +; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; VI-GISEL-NEXT: s_or_b32 s2, s5, s2 +; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4 ; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12 @@ -331,10 +335,11 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: s_lshr_b32 s4, s3, 8 -; GFX10-SDAG-NEXT: s_and_b32 s5, s3, 0x1ff -; GFX10-SDAG-NEXT: s_and_b32 s4, s4, 0xffe -; GFX10-SDAG-NEXT: s_or_b32 s2, s5, s2 +; GFX10-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff +; GFX10-SDAG-NEXT: s_lshr_b32 s5, s3, 8 +; GFX10-SDAG-NEXT: s_or_b32 s2, s4, s2 +; GFX10-SDAG-NEXT: s_and_b32 s4, s5, 0xffe +; GFX10-SDAG-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX10-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014 @@ -382,14 +387,16 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX10-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX10-GISEL-NEXT: s_lshr_b32 s5, s3, 8 -; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff +; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2 ; GFX10-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; GFX10-GISEL-NEXT: s_and_b32 s5, s5, 0xffe -; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2 +; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; GFX10-GISEL-NEXT: s_or_b32 s2, s5, s2 +; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX10-GISEL-NEXT: s_sub_i32 s6, 1, s4 ; GFX10-GISEL-NEXT: s_or_b32 s8, s2, 0x1000 @@ -431,10 +438,11 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_lshr_b32 s4, s3, 8 -; GFX11-SDAG-NEXT: s_and_b32 s5, s3, 0x1ff -; GFX11-SDAG-NEXT: s_and_b32 s4, s4, 0xffe -; GFX11-SDAG-NEXT: s_or_b32 s2, s5, s2 +; GFX11-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff +; GFX11-SDAG-NEXT: s_lshr_b32 s5, s3, 8 +; GFX11-SDAG-NEXT: s_or_b32 s2, s4, s2 +; GFX11-SDAG-NEXT: s_and_b32 s4, s5, 0xffe +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 @@ -490,15 +498,17 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX11-GISEL-NEXT: s_lshr_b32 s5, s3, 8 -; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff +; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2 ; GFX11-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-NEXT: s_and_b32 s5, s5, 0xffe -; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2 +; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-GISEL-NEXT: s_cselect_b32 s2, 1, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_or_b32 s2, s5, s2 +; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-NEXT: s_or_b32 s8, s2, 0x1000 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 31f277f..37756d1 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -472,6 +472,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -535,10 +536,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -604,6 +606,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -657,11 +660,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1 +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -706,8 +710,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -1685,6 +1690,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -1748,10 +1754,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1817,6 +1824,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -1870,11 +1878,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1 +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1919,8 +1928,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -2958,6 +2968,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -3021,10 +3032,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3090,6 +3102,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -3143,11 +3156,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1 +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3192,8 +3206,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -3727,6 +3742,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -3790,10 +3806,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3859,6 +3876,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -3912,11 +3930,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1 +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3961,8 +3980,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -4999,6 +5019,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -5062,10 +5083,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5131,6 +5153,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -5184,11 +5207,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5246,8 +5270,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -6259,6 +6284,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6328,6 +6354,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6397,6 +6424,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6457,6 +6485,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6520,6 +6550,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7686,6 +7717,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7755,6 +7787,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7824,6 +7857,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7884,6 +7918,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7947,6 +7983,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9113,6 +9150,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9182,6 +9220,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9251,6 +9290,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9311,6 +9351,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9374,6 +9416,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10022,6 +10065,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10091,6 +10135,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10160,6 +10205,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10220,6 +10266,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10283,6 +10331,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11449,6 +11498,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11518,6 +11568,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11587,6 +11638,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11647,6 +11699,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11710,6 +11764,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 4581efc..6351bb3 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -381,12 +381,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-NEXT: .LBB1_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s3, v0, s2 -; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_max_f32_e32 v2, v2, v1 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v1, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -456,6 +457,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -511,6 +513,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 @@ -559,7 +562,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -606,9 +610,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -1414,12 +1420,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s3, v0, s2 -; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_max_f32_e32 v2, v2, v1 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v1, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1489,6 +1496,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -1544,6 +1552,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 @@ -1592,7 +1601,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -1639,9 +1649,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -2447,12 +2459,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-NEXT: .LBB5_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s3, v0, s2 -; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_max_f32_e32 v2, v2, v1 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v1, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2522,6 +2535,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -2577,6 +2591,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 @@ -2625,7 +2640,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -2672,9 +2688,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -3573,6 +3591,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -3646,6 +3665,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -3704,6 +3724,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -3753,7 +3774,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -3819,9 +3841,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -4836,6 +4859,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -4909,6 +4933,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -4967,6 +4992,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -5016,7 +5042,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -5082,9 +5109,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -6099,6 +6127,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -6172,6 +6201,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -6230,6 +6260,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -6279,7 +6310,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -6345,9 +6377,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index bd570d9..a9ac008 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -381,12 +381,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-NEXT: .LBB1_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s3, v0, s2 -; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_min_f32_e32 v2, v2, v1 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_min_f32_e32 v2, v1, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -456,6 +457,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -511,6 +513,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 @@ -559,7 +562,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -606,9 +610,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -1414,12 +1420,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s3, v0, s2 -; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_min_f32_e32 v2, v2, v1 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_min_f32_e32 v2, v1, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1489,6 +1496,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -1544,6 +1552,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 @@ -1592,7 +1601,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -1639,9 +1649,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -2447,12 +2459,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-NEXT: .LBB5_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s3, v0, s2 -; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_min_f32_e32 v2, v2, v1 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_min_f32_e32 v2, v1, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2522,6 +2535,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -2577,6 +2591,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 @@ -2625,7 +2640,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -2672,9 +2688,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -3573,6 +3591,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -3646,6 +3665,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -3704,6 +3724,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -3753,7 +3774,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -3819,9 +3841,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -4836,6 +4859,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -4909,6 +4933,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -4967,6 +4992,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -5016,7 +5042,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -5082,9 +5109,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -6099,6 +6127,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -6172,6 +6201,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -6230,6 +6260,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -6279,7 +6310,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -6345,9 +6377,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index 1f2d70c..6311143 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -532,6 +532,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -595,10 +596,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -664,6 +666,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -717,11 +720,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -779,8 +783,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -1857,6 +1862,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -1920,10 +1926,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1989,6 +1996,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -2042,11 +2050,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2104,8 +2113,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -3182,6 +3192,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -3245,10 +3256,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3314,6 +3326,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -3367,11 +3380,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3429,8 +3443,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -4003,6 +4018,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -4066,10 +4082,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4135,6 +4152,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -4188,11 +4206,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4250,8 +4269,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -5327,6 +5347,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -5390,10 +5411,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5459,6 +5481,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -5512,11 +5535,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5574,8 +5598,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -6587,6 +6612,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6656,6 +6682,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6725,6 +6752,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6785,6 +6813,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6848,6 +6878,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8013,6 +8044,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8082,6 +8114,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8151,6 +8184,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8211,6 +8245,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8274,6 +8310,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9440,6 +9477,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9509,6 +9547,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9578,6 +9617,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9638,6 +9678,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9701,6 +9743,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10349,6 +10392,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10418,6 +10462,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10487,6 +10532,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10547,6 +10593,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10610,6 +10658,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11775,6 +11824,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11844,6 +11894,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11913,6 +11964,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11973,6 +12025,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12036,6 +12090,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index c3f3917..eee232a 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -136,17 +136,19 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: .LBB2_6: ; %bb18 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_cselect_b32 s13, -1, 0 -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13 -; GFX11-NEXT: s_and_b32 s13, s8, s13 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: s_and_b32 s13, s13, exec_lo +; GFX11-NEXT: v_readfirstlane_b32 s13, v0 +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_cselect_b32 s1, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 +; GFX11-NEXT: s_and_b32 s1, s8, s1 +; GFX11-NEXT: s_and_b32 s1, s1, exec_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readfirstlane_b32 s19, v2 -; GFX11-NEXT: s_cselect_b32 s1, s19, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s1, s1, 1 +; GFX11-NEXT: s_cselect_b32 s1, s19, s13 ; GFX11-NEXT: s_and_b32 s13, 0xffff, s0 +; GFX11-NEXT: s_and_b32 s1, s1, 1 +; GFX11-NEXT: s_cmp_lg_u32 s13, 0 ; GFX11-NEXT: s_cselect_b32 s13, -1, 0 ; GFX11-NEXT: s_and_b32 s20, s9, exec_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 6dc9199..8748aff 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -8265,10 +8265,12 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_readlane_b32 s6, v1, s3 +; GFX12-NEXT: s_lshl_b32 s7, 1, s3 ; GFX12-NEXT: v_writelane_b32 v0, s0, s3 -; GFX12-NEXT: s_lshl_b32 s3, 1, s3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 s1, s1, s3 +; GFX12-NEXT: s_and_not1_b32 s1, s1, s7 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12-NEXT: s_add_f32 s0, s0, s6 ; GFX12-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX12-NEXT: ; %bb.6: ; %ComputeEnd @@ -8349,13 +8351,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX942-NEXT: .LBB28_5: ; %ComputeLoop ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX942-NEXT: v_readfirstlane_b32 s6, v1 -; GFX942-NEXT: s_mov_b32 m0, s3 -; GFX942-NEXT: v_readlane_b32 s8, v2, s3 -; GFX942-NEXT: v_writelane_b32 v0, s6, m0 ; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX942-NEXT: v_readfirstlane_b32 s8, v1 +; GFX942-NEXT: v_readlane_b32 s9, v2, s3 +; GFX942-NEXT: s_mov_b32 m0, s3 ; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX942-NEXT: v_add_f32_e32 v1, s8, v1 +; GFX942-NEXT: v_writelane_b32 v0, s8, m0 +; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX942-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX942-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX942-NEXT: ; %bb.6: ; %ComputeEnd ; GFX942-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8437,14 +8440,15 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: .LBB28_5: ; %ComputeLoop ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_ctz_i32_b32 s1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s3, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readlane_b32 s6, v2, s1 +; GFX11-NEXT: s_lshl_b32 s7, 1, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, s0, s7 ; GFX11-NEXT: v_writelane_b32 v0, s3, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX11-NEXT: s_lshl_b32 s1, 1, s1 -; GFX11-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX11-NEXT: ; %bb.6: ; %ComputeEnd ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8524,10 +8528,11 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX10-NEXT: s_ff1_i32_b32 s1, s0 ; GFX10-NEXT: v_readfirstlane_b32 s3, v1 ; GFX10-NEXT: v_readlane_b32 s6, v2, s1 +; GFX10-NEXT: s_lshl_b32 s7, 1, s1 +; GFX10-NEXT: s_andn2_b32 s0, s0, s7 ; GFX10-NEXT: v_writelane_b32 v0, s3, s1 ; GFX10-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX10-NEXT: s_lshl_b32 s1, 1, s1 -; GFX10-NEXT: s_andn2_b32 s0, s0, s1 +; GFX10-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX10-NEXT: ; %bb.6: ; %ComputeEnd ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8604,13 +8609,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX90A-NEXT: .LBB28_5: ; %ComputeLoop ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX90A-NEXT: v_readfirstlane_b32 s6, v1 -; GFX90A-NEXT: s_mov_b32 m0, s3 -; GFX90A-NEXT: v_readlane_b32 s8, v2, s3 -; GFX90A-NEXT: v_writelane_b32 v0, s6, m0 ; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v1 +; GFX90A-NEXT: v_readlane_b32 s9, v2, s3 +; GFX90A-NEXT: s_mov_b32 m0, s3 ; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX90A-NEXT: v_add_f32_e32 v1, s8, v1 +; GFX90A-NEXT: v_writelane_b32 v0, s8, m0 +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX90A-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8686,13 +8692,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX908-NEXT: .LBB28_5: ; %ComputeLoop ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX908-NEXT: v_readfirstlane_b32 s6, v1 -; GFX908-NEXT: s_mov_b32 m0, s3 -; GFX908-NEXT: v_readlane_b32 s8, v2, s3 -; GFX908-NEXT: v_writelane_b32 v0, s6, m0 ; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX908-NEXT: v_readfirstlane_b32 s8, v1 +; GFX908-NEXT: v_readlane_b32 s9, v2, s3 +; GFX908-NEXT: s_mov_b32 m0, s3 ; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX908-NEXT: v_add_f32_e32 v1, s8, v1 +; GFX908-NEXT: v_writelane_b32 v0, s8, m0 +; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX908-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX908-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX908-NEXT: ; %bb.6: ; %ComputeEnd ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8769,13 +8776,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: .LBB28_5: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX8-NEXT: v_readfirstlane_b32 s6, v1 -; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v2, s3 -; GFX8-NEXT: v_writelane_b32 v0, s6, m0 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readfirstlane_b32 s8, v1 +; GFX8-NEXT: v_readlane_b32 s9, v2, s3 +; GFX8-NEXT: s_mov_b32 m0, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: v_add_f32_e32 v1, s8, v1 +; GFX8-NEXT: v_writelane_b32 v0, s8, m0 +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX8-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX8-NEXT: ; %bb.6: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9122,10 +9130,12 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_readlane_b32 s6, v1, s3 +; GFX12-NEXT: s_lshl_b32 s7, 1, s3 ; GFX12-NEXT: v_writelane_b32 v0, s0, s3 -; GFX12-NEXT: s_lshl_b32 s3, 1, s3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 s1, s1, s3 +; GFX12-NEXT: s_and_not1_b32 s1, s1, s7 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12-NEXT: s_add_f32 s0, s0, s6 ; GFX12-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX12-NEXT: ; %bb.6: ; %ComputeEnd @@ -9202,13 +9212,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX942-NEXT: .LBB29_5: ; %ComputeLoop ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX942-NEXT: v_readfirstlane_b32 s6, v1 -; GFX942-NEXT: s_mov_b32 m0, s3 -; GFX942-NEXT: v_readlane_b32 s8, v2, s3 -; GFX942-NEXT: v_writelane_b32 v0, s6, m0 ; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX942-NEXT: v_readfirstlane_b32 s8, v1 +; GFX942-NEXT: v_readlane_b32 s9, v2, s3 +; GFX942-NEXT: s_mov_b32 m0, s3 ; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX942-NEXT: v_add_f32_e32 v1, s8, v1 +; GFX942-NEXT: v_writelane_b32 v0, s8, m0 +; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX942-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX942-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX942-NEXT: ; %bb.6: ; %ComputeEnd ; GFX942-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9285,14 +9296,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: .LBB29_5: ; %ComputeLoop ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_ctz_i32_b32 s1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s3, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readlane_b32 s6, v2, s1 +; GFX11-NEXT: s_lshl_b32 s7, 1, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, s0, s7 ; GFX11-NEXT: v_writelane_b32 v0, s3, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX11-NEXT: s_lshl_b32 s1, 1, s1 -; GFX11-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX11-NEXT: ; %bb.6: ; %ComputeEnd ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9365,10 +9377,11 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: s_ff1_i32_b32 s1, s0 ; GFX10-NEXT: v_readfirstlane_b32 s3, v1 ; GFX10-NEXT: v_readlane_b32 s6, v2, s1 +; GFX10-NEXT: s_lshl_b32 s7, 1, s1 +; GFX10-NEXT: s_andn2_b32 s0, s0, s7 ; GFX10-NEXT: v_writelane_b32 v0, s3, s1 ; GFX10-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX10-NEXT: s_lshl_b32 s1, 1, s1 -; GFX10-NEXT: s_andn2_b32 s0, s0, s1 +; GFX10-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX10-NEXT: ; %bb.6: ; %ComputeEnd ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9440,13 +9453,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: .LBB29_5: ; %ComputeLoop ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX90A-NEXT: v_readfirstlane_b32 s6, v1 -; GFX90A-NEXT: s_mov_b32 m0, s3 -; GFX90A-NEXT: v_readlane_b32 s8, v2, s3 -; GFX90A-NEXT: v_writelane_b32 v0, s6, m0 ; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v1 +; GFX90A-NEXT: v_readlane_b32 s9, v2, s3 +; GFX90A-NEXT: s_mov_b32 m0, s3 ; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX90A-NEXT: v_add_f32_e32 v1, s8, v1 +; GFX90A-NEXT: v_writelane_b32 v0, s8, m0 +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX90A-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9519,13 +9533,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-NEXT: .LBB29_5: ; %ComputeLoop ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX908-NEXT: v_readfirstlane_b32 s6, v1 -; GFX908-NEXT: s_mov_b32 m0, s3 -; GFX908-NEXT: v_readlane_b32 s8, v2, s3 -; GFX908-NEXT: v_writelane_b32 v0, s6, m0 ; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX908-NEXT: v_readfirstlane_b32 s8, v1 +; GFX908-NEXT: v_readlane_b32 s9, v2, s3 +; GFX908-NEXT: s_mov_b32 m0, s3 ; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX908-NEXT: v_add_f32_e32 v1, s8, v1 +; GFX908-NEXT: v_writelane_b32 v0, s8, m0 +; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX908-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX908-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX908-NEXT: ; %bb.6: ; %ComputeEnd ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9599,13 +9614,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: .LBB29_5: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX8-NEXT: v_readfirstlane_b32 s6, v1 -; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v2, s3 -; GFX8-NEXT: v_writelane_b32 v0, s6, m0 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readfirstlane_b32 s8, v1 +; GFX8-NEXT: v_readlane_b32 s9, v2, s3 +; GFX8-NEXT: s_mov_b32 m0, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: v_add_f32_e32 v1, s8, v1 +; GFX8-NEXT: v_writelane_b32 v0, s8, m0 +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX8-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX8-NEXT: ; %bb.6: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir index fba42c4..c1cf06e 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir +++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir @@ -388,8 +388,9 @@ body: | ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: S_BITCMP1_B32 killed [[COPY]], 0, implicit-def $scc - ; GCN-NEXT: S_NOP 0, implicit $scc + ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY]], implicit-def $scc + ; GCN-NEXT: S_NOP 0, implicit killed $scc + ; GCN-NEXT: S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} @@ -416,80 +417,6 @@ body: | S_ENDPGM 0 ... ---- -name: xor_1_cmp_lg_0_killed_scc -body: | - ; GCN-LABEL: name: xor_1_cmp_lg_0_killed_scc - ; GCN: bb.0: - ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 1, killed [[COPY]], implicit-def $scc - ; GCN-NEXT: S_NOP 0, implicit $scc - ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc - ; GCN-NEXT: S_BRANCH %bb.1 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x80000000) - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.2: - ; GCN-NEXT: S_ENDPGM 0 - bb.0: - successors: %bb.1(0x40000000), %bb.2(0x40000000) - liveins: $sgpr0, $vgpr0_vgpr1 - - %0:sreg_32 = COPY $sgpr0 - %1:sreg_32 = S_XOR_B32 1, killed %0, implicit-def $scc - S_NOP 0, implicit killed $scc - S_CMP_LG_U32 killed %1:sreg_32, 0, implicit-def $scc - S_CBRANCH_SCC0 %bb.2, implicit $scc - S_BRANCH %bb.1 - - bb.1: - successors: %bb.2(0x80000000) - - bb.2: - S_ENDPGM 0 - -... ---- -name: absdiff_1_cmp_lg_0_killed_scc -body: | - ; GCN-LABEL: name: absdiff_1_cmp_lg_0_killed_scc - ; GCN: bb.0: - ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: [[S_ABSDIFF_I32_:%[0-9]+]]:sreg_32 = S_ABSDIFF_I32 1, killed [[COPY]], implicit-def $scc - ; GCN-NEXT: S_NOP 0, implicit $scc - ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc - ; GCN-NEXT: S_BRANCH %bb.1 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x80000000) - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.2: - ; GCN-NEXT: S_ENDPGM 0 - bb.0: - successors: %bb.1(0x40000000), %bb.2(0x40000000) - liveins: $sgpr0, $vgpr0_vgpr1 - - %0:sreg_32 = COPY $sgpr0 - %1:sreg_32 = S_ABSDIFF_I32 1, killed %0, implicit-def $scc - S_NOP 0, implicit killed $scc - S_CMP_LG_U32 killed %1:sreg_32, 0, implicit-def $scc - S_CBRANCH_SCC0 %bb.2, implicit $scc - S_BRANCH %bb.1 - - bb.1: - successors: %bb.2(0x80000000) - - bb.2: - S_ENDPGM 0 - -... --- name: and_1_cmp_eq_1_clobbered_scc @@ -2143,7 +2070,8 @@ body: | ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 3, killed [[COPY]], implicit-def $scc + ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 3, killed [[COPY]], implicit-def dead $scc + ; GCN-NEXT: S_CMP_LG_U32 killed [[S_AND_B32_]], 0, implicit-def $scc ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll index 19cc7f7..f53aaaa 100644 --- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll +++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll @@ -10,6 +10,7 @@ define amdgpu_ps i32 @shl32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: shl32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_lshl_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -24,6 +25,7 @@ define amdgpu_ps i32 @shl64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: shl64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -38,6 +40,7 @@ define amdgpu_ps i32 @lshr32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: lshr32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_lshr_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -52,6 +55,7 @@ define amdgpu_ps i32 @lshr64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: lshr64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -66,6 +70,7 @@ define amdgpu_ps i32 @ashr32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: ashr32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_ashr_i32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -80,6 +85,7 @@ define amdgpu_ps i32 @ashr64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: ashr64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_ashr_i64 s[0:1], s[0:1], s2 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -94,6 +100,7 @@ define amdgpu_ps i32 @abs32(i32 inreg %val0) { ; CHECK-LABEL: abs32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_abs_i32 s0, s0 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -114,6 +121,7 @@ define amdgpu_ps i32 @and32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: and32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_and_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -128,6 +136,7 @@ define amdgpu_ps i32 @and64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: and64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -142,6 +151,7 @@ define amdgpu_ps i32 @or32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: or32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_or_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -156,6 +166,7 @@ define amdgpu_ps i32 @or64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: or64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -170,6 +181,7 @@ define amdgpu_ps i32 @xor32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: xor32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_xor_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -184,6 +196,7 @@ define amdgpu_ps i32 @xor64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: xor64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -198,6 +211,7 @@ define amdgpu_ps i32 @nand32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: nand32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_nand_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -217,6 +231,7 @@ define amdgpu_ps i32 @nand64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: nand64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -236,6 +251,7 @@ define amdgpu_ps i32 @nor32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: nor32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_nor_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -255,6 +271,7 @@ define amdgpu_ps i32 @nor64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: nor64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -274,6 +291,7 @@ define amdgpu_ps i32 @xnor32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: xnor32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_xnor_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -293,6 +311,7 @@ define amdgpu_ps i32 @xnor64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: xnor64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_xnor_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -312,6 +331,7 @@ define amdgpu_ps i32 @andn232(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: andn232: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_andn2_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -327,6 +347,7 @@ define amdgpu_ps i32 @nandn264(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: nandn264: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -342,6 +363,7 @@ define amdgpu_ps i32 @orn232(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: orn232: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_orn2_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -357,6 +379,7 @@ define amdgpu_ps i32 @orn264(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: orn264: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_orn2_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -372,6 +395,7 @@ define amdgpu_ps i32 @bfe_i32(i32 inreg %val0) { ; CHECK-LABEL: bfe_i32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_bfe_i32 s0, s0, 0x80010 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -409,6 +433,7 @@ define amdgpu_ps i32 @bfe_u32(i32 inreg %val0) { ; CHECK-LABEL: bfe_u32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_bfe_u32 s0, s0, 0x80010 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -488,6 +513,7 @@ define amdgpu_ps i32 @bcnt132(i32 inreg %val0) { ; CHECK-LABEL: bcnt132: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -526,6 +552,7 @@ define amdgpu_ps i32 @quadmask32(i32 inreg %val0) { ; CHECK-LABEL: quadmask32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_quadmask_b32 s0, s0 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -544,6 +571,7 @@ define amdgpu_ps i32 @quadmask64(i64 inreg %val0) { ; CHECK-LABEL: quadmask64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_quadmask_b64 s[0:1], s[0:1] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -562,6 +590,7 @@ define amdgpu_ps i32 @not32(i32 inreg %val0) { ; CHECK-LABEL: not32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_not_b32 s0, s0 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -580,6 +609,7 @@ define amdgpu_ps i32 @not64(i64 inreg %val0) { ; CHECK-LABEL: not64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_not_b64 s[0:1], s[0:1] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll b/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll index 7552f6b..a828ee0 100644 --- a/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll +++ b/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll @@ -12,6 +12,8 @@ define amdgpu_ps i32 @s_uaddo_pseudo(i32 inreg %val0) { ; CHECK-LABEL: s_uaddo_pseudo: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 s0, s0, 1 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_addc_u32 s0, 1, 0 ; CHECK-NEXT: ; return to shader part epilog %pair = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %val0, i32 1) @@ -30,6 +32,8 @@ define amdgpu_ps i32 @s_usubo_pseudo(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: s_usubo_pseudo: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_sub_u32 s0, s0, 1 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 ; CHECK-NEXT: s_subb_u32 s0, s1, 0 ; CHECK-NEXT: ; return to shader part epilog %pair = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %val0, i32 1) diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 71f5a94..5f6d622 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -56,9 +56,10 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_addc_u32 s15, 0, s16 ; GCN-NEXT: s_add_u32 s16, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NEXT: v_mul_hi_u32 v0, s12, v0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: v_mul_hi_u32 v0, s12, v0 ; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s14, s14, s15 ; GCN-NEXT: s_mul_i32 s0, s12, s14 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 @@ -89,6 +90,7 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_add_u32 s15, s16, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s14, s14, s12 ; GCN-NEXT: s_ashr_i32 s12, s7, 31 ; GCN-NEXT: s_add_u32 s0, s6, s12 @@ -114,50 +116,52 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-NEXT: s_addc_u32 s4, s4, 0 ; GCN-NEXT: s_mul_i32 s14, s7, s14 -; GCN-NEXT: s_add_u32 s16, s1, s14 -; GCN-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NEXT: s_add_u32 s14, s1, s14 +; GCN-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 -; GCN-NEXT: s_addc_u32 s17, 0, s4 +; GCN-NEXT: s_addc_u32 s15, 0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_mul_i32 s4, s10, s17 +; GCN-NEXT: s_mul_i32 s4, s10, s15 ; GCN-NEXT: v_readfirstlane_b32 s5, v0 ; GCN-NEXT: s_add_i32 s4, s5, s4 -; GCN-NEXT: s_mul_i32 s5, s11, s16 -; GCN-NEXT: s_add_i32 s18, s4, s5 -; GCN-NEXT: s_sub_i32 s14, s7, s18 -; GCN-NEXT: s_mul_i32 s4, s10, s16 +; GCN-NEXT: s_mul_i32 s5, s11, s14 +; GCN-NEXT: s_add_i32 s16, s4, s5 +; GCN-NEXT: s_sub_i32 s17, s7, s16 +; GCN-NEXT: s_mul_i32 s4, s10, s14 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s15, s4, s5 -; GCN-NEXT: s_subb_u32 s19, s14, s11 -; GCN-NEXT: s_sub_u32 s20, s6, s10 -; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GCN-NEXT: s_or_b32 s14, s14, s15 -; GCN-NEXT: s_subb_u32 s14, s19, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s11 -; GCN-NEXT: s_cselect_b32 s15, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s20, s10 -; GCN-NEXT: s_cselect_b32 s19, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s14, s11 -; GCN-NEXT: s_cselect_b32 s14, s19, s15 -; GCN-NEXT: s_add_u32 s15, s16, 1 -; GCN-NEXT: s_addc_u32 s19, s17, 0 -; GCN-NEXT: s_add_u32 s20, s16, 2 -; GCN-NEXT: s_addc_u32 s21, s17, 0 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_cselect_b32 s14, s20, s15 -; GCN-NEXT: s_cselect_b32 s15, s21, s19 +; GCN-NEXT: s_or_b32 s18, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s18, 0 +; GCN-NEXT: s_subb_u32 s17, s17, s11 +; GCN-NEXT: s_sub_u32 s19, s6, s10 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_subb_u32 s4, s7, s18 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_subb_u32 s4, s17, 0 ; GCN-NEXT: s_cmp_ge_u32 s4, s11 ; GCN-NEXT: s_cselect_b32 s5, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s6, s10 -; GCN-NEXT: s_cselect_b32 s6, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s19, s10 +; GCN-NEXT: s_cselect_b32 s17, -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s4, s11 -; GCN-NEXT: s_cselect_b32 s4, s6, s5 +; GCN-NEXT: s_cselect_b32 s4, s17, s5 +; GCN-NEXT: s_add_u32 s5, s14, 1 +; GCN-NEXT: s_addc_u32 s17, s15, 0 +; GCN-NEXT: s_add_u32 s19, s14, 2 +; GCN-NEXT: s_addc_u32 s20, s15, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_cselect_b32 s5, s15, s17 -; GCN-NEXT: s_cselect_b32 s4, s14, s16 +; GCN-NEXT: s_cselect_b32 s4, s19, s5 +; GCN-NEXT: s_cselect_b32 s5, s20, s17 +; GCN-NEXT: s_cmp_lg_u32 s18, 0 +; GCN-NEXT: s_subb_u32 s7, s7, s16 +; GCN-NEXT: s_cmp_ge_u32 s7, s11 +; GCN-NEXT: s_cselect_b32 s16, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s6, s10 +; GCN-NEXT: s_cselect_b32 s6, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s7, s11 +; GCN-NEXT: s_cselect_b32 s6, s6, s16 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_cselect_b32 s5, s5, s15 +; GCN-NEXT: s_cselect_b32 s4, s4, s14 ; GCN-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9] ; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GCN-NEXT: s_sub_u32 s4, s4, s6 @@ -204,6 +208,7 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_add_u32 s18, s16, 1 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_or_b32 s10, s10, s11 +; GCN-IR-NEXT: s_cmp_lg_u32 s10, 0 ; GCN-IR-NEXT: s_addc_u32 s10, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s16, 63, s16 @@ -237,6 +242,7 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_or_b32 s20, s20, s21 +; GCN-IR-NEXT: s_cmp_lg_u32 s20, 0 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[12:13], s[8:9] @@ -1189,9 +1195,10 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_addc_u32 s12, 0, s13 ; GCN-NEXT: s_add_u32 s13, s8, s9 ; GCN-NEXT: v_mov_b32_e32 v0, s13 -; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: s_addc_u32 s11, s11, s12 ; GCN-NEXT: s_mul_i32 s8, s2, s11 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 @@ -1222,6 +1229,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_add_u32 s2, s13, s2 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: s_addc_u32 s8, s11, s10 ; GCN-NEXT: v_mul_hi_u32 v1, s2, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s8, 24 @@ -1230,46 +1238,48 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_readfirstlane_b32 s10, v1 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 ; GCN-NEXT: s_add_u32 s8, s10, s8 -; GCN-NEXT: s_addc_u32 s12, 0, s9 -; GCN-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NEXT: s_addc_u32 s10, 0, s9 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 -; GCN-NEXT: s_mul_i32 s8, s7, s12 +; GCN-NEXT: s_mul_i32 s8, s7, s10 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 -; GCN-NEXT: s_add_i32 s13, s9, s8 -; GCN-NEXT: s_sub_i32 s10, 0, s13 -; GCN-NEXT: s_mul_i32 s8, s6, s12 -; GCN-NEXT: s_sub_u32 s14, 24, s8 +; GCN-NEXT: s_add_i32 s11, s9, s8 +; GCN-NEXT: s_sub_i32 s12, 0, s11 +; GCN-NEXT: s_mul_i32 s8, s6, s10 +; GCN-NEXT: s_sub_u32 s13, 24, s8 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s14, s8, s9 +; GCN-NEXT: s_cmp_lg_u32 s14, 0 +; GCN-NEXT: s_subb_u32 s12, s12, s7 +; GCN-NEXT: s_sub_u32 s15, s13, s6 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s11, s8, s9 -; GCN-NEXT: s_subb_u32 s15, s10, s7 -; GCN-NEXT: s_sub_u32 s16, s14, s6 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s10, s15, 0 -; GCN-NEXT: s_cmp_ge_u32 s10, s7 -; GCN-NEXT: s_cselect_b32 s11, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s16, s6 -; GCN-NEXT: s_cselect_b32 s15, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s10, s7 -; GCN-NEXT: s_cselect_b32 s10, s15, s11 -; GCN-NEXT: s_add_u32 s11, s12, 1 -; GCN-NEXT: s_addc_u32 s15, 0, 0 -; GCN-NEXT: s_add_u32 s16, s12, 2 -; GCN-NEXT: s_addc_u32 s17, 0, 0 -; GCN-NEXT: s_cmp_lg_u32 s10, 0 -; GCN-NEXT: s_cselect_b32 s10, s16, s11 -; GCN-NEXT: s_cselect_b32 s11, s17, s15 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s8, 0, s13 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_subb_u32 s8, s12, 0 ; GCN-NEXT: s_cmp_ge_u32 s8, s7 ; GCN-NEXT: s_cselect_b32 s9, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s6 -; GCN-NEXT: s_cselect_b32 s6, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s15, s6 +; GCN-NEXT: s_cselect_b32 s12, -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s8, s7 -; GCN-NEXT: s_cselect_b32 s6, s6, s9 +; GCN-NEXT: s_cselect_b32 s8, s12, s9 +; GCN-NEXT: s_add_u32 s9, s10, 1 +; GCN-NEXT: s_addc_u32 s12, 0, 0 +; GCN-NEXT: s_add_u32 s15, s10, 2 +; GCN-NEXT: s_addc_u32 s16, 0, 0 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cselect_b32 s8, s15, s9 +; GCN-NEXT: s_cselect_b32 s9, s16, s12 +; GCN-NEXT: s_cmp_lg_u32 s14, 0 +; GCN-NEXT: s_subb_u32 s11, 0, s11 +; GCN-NEXT: s_cmp_ge_u32 s11, s7 +; GCN-NEXT: s_cselect_b32 s12, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s13, s6 +; GCN-NEXT: s_cselect_b32 s6, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s11, s7 +; GCN-NEXT: s_cselect_b32 s6, s6, s12 ; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_cselect_b32 s7, s11, 0 -; GCN-NEXT: s_cselect_b32 s6, s10, s12 +; GCN-NEXT: s_cselect_b32 s7, s9, 0 +; GCN-NEXT: s_cselect_b32 s6, s8, s10 ; GCN-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_subb_u32 s7, s7, s4 @@ -1305,6 +1315,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s12, s10, 1 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_or_b32 s8, s8, s9 +; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-IR-NEXT: s_addc_u32 s8, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s10, 63, s10 @@ -1337,6 +1348,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_or_b32 s18, s18, s19 +; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index e12e31b..bbd1793 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -1513,7 +1513,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GCN-NEXT: s_sub_u32 s3, 0, s8 -; GCN-NEXT: s_subb_u32 s10, 0, s9 +; GCN-NEXT: s_subb_u32 s12, 0, s9 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1522,52 +1522,56 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s11, v1 -; GCN-NEXT: v_readfirstlane_b32 s12, v0 -; GCN-NEXT: s_mul_i32 s13, s3, s11 -; GCN-NEXT: s_mul_hi_u32 s15, s3, s12 -; GCN-NEXT: s_mul_i32 s14, s10, s12 -; GCN-NEXT: s_add_i32 s13, s15, s13 -; GCN-NEXT: s_add_i32 s13, s13, s14 -; GCN-NEXT: s_mul_i32 s16, s3, s12 -; GCN-NEXT: s_mul_i32 s15, s12, s13 -; GCN-NEXT: s_mul_hi_u32 s17, s12, s16 -; GCN-NEXT: s_mul_hi_u32 s14, s12, s13 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-NEXT: s_mul_i32 s11, s3, s13 +; GCN-NEXT: s_mul_hi_u32 s15, s3, s10 +; GCN-NEXT: s_mul_i32 s14, s12, s10 +; GCN-NEXT: s_add_i32 s11, s15, s11 +; GCN-NEXT: s_add_i32 s11, s11, s14 +; GCN-NEXT: s_mul_i32 s16, s3, s10 +; GCN-NEXT: s_mul_i32 s15, s10, s11 +; GCN-NEXT: s_mul_hi_u32 s17, s10, s16 +; GCN-NEXT: s_mul_hi_u32 s14, s10, s11 ; GCN-NEXT: s_add_u32 s15, s17, s15 ; GCN-NEXT: s_addc_u32 s14, 0, s14 -; GCN-NEXT: s_mul_hi_u32 s18, s11, s16 -; GCN-NEXT: s_mul_i32 s16, s11, s16 +; GCN-NEXT: s_mul_hi_u32 s18, s13, s16 +; GCN-NEXT: s_mul_i32 s16, s13, s16 ; GCN-NEXT: s_add_u32 s15, s15, s16 -; GCN-NEXT: s_mul_hi_u32 s17, s11, s13 +; GCN-NEXT: s_mul_hi_u32 s17, s13, s11 ; GCN-NEXT: s_addc_u32 s14, s14, s18 ; GCN-NEXT: s_addc_u32 s15, s17, 0 -; GCN-NEXT: s_mul_i32 s13, s11, s13 -; GCN-NEXT: s_add_u32 s13, s14, s13 +; GCN-NEXT: s_mul_i32 s11, s13, s11 +; GCN-NEXT: s_add_u32 s11, s14, s11 ; GCN-NEXT: s_addc_u32 s14, 0, s15 -; GCN-NEXT: s_add_u32 s12, s12, s13 -; GCN-NEXT: s_addc_u32 s11, s11, s14 -; GCN-NEXT: s_mul_i32 s13, s3, s11 -; GCN-NEXT: s_mul_hi_u32 s14, s3, s12 -; GCN-NEXT: s_add_i32 s13, s14, s13 -; GCN-NEXT: s_mul_i32 s10, s10, s12 -; GCN-NEXT: s_add_i32 s13, s13, s10 -; GCN-NEXT: s_mul_i32 s3, s3, s12 -; GCN-NEXT: s_mul_hi_u32 s14, s11, s3 -; GCN-NEXT: s_mul_i32 s15, s11, s3 -; GCN-NEXT: s_mul_i32 s17, s12, s13 -; GCN-NEXT: s_mul_hi_u32 s3, s12, s3 -; GCN-NEXT: s_mul_hi_u32 s16, s12, s13 +; GCN-NEXT: s_add_u32 s15, s10, s11 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GCN-NEXT: s_addc_u32 s13, s13, s14 +; GCN-NEXT: s_mul_i32 s10, s3, s13 +; GCN-NEXT: s_mul_hi_u32 s11, s3, s15 +; GCN-NEXT: s_add_i32 s10, s11, s10 +; GCN-NEXT: s_mul_i32 s12, s12, s15 +; GCN-NEXT: s_add_i32 s10, s10, s12 +; GCN-NEXT: s_mul_i32 s3, s3, s15 +; GCN-NEXT: s_mul_hi_u32 s12, s13, s3 +; GCN-NEXT: s_mul_i32 s14, s13, s3 +; GCN-NEXT: s_mul_i32 s17, s15, s10 +; GCN-NEXT: s_mul_hi_u32 s3, s15, s3 +; GCN-NEXT: s_mul_hi_u32 s16, s15, s10 ; GCN-NEXT: s_add_u32 s3, s3, s17 ; GCN-NEXT: s_addc_u32 s16, 0, s16 -; GCN-NEXT: s_add_u32 s3, s3, s15 -; GCN-NEXT: s_mul_hi_u32 s10, s11, s13 -; GCN-NEXT: s_addc_u32 s3, s16, s14 -; GCN-NEXT: s_addc_u32 s10, s10, 0 -; GCN-NEXT: s_mul_i32 s13, s11, s13 -; GCN-NEXT: s_add_u32 s3, s3, s13 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: s_add_u32 s3, s12, s3 -; GCN-NEXT: s_addc_u32 s14, s11, s10 +; GCN-NEXT: s_add_u32 s3, s3, s14 +; GCN-NEXT: s_mul_hi_u32 s11, s13, s10 +; GCN-NEXT: s_addc_u32 s3, s16, s12 +; GCN-NEXT: s_addc_u32 s11, s11, 0 +; GCN-NEXT: s_mul_i32 s10, s13, s10 +; GCN-NEXT: s_add_u32 s3, s3, s10 +; GCN-NEXT: s_addc_u32 s12, 0, s11 +; GCN-NEXT: s_add_u32 s3, s15, s3 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GCN-NEXT: s_addc_u32 s14, s13, s12 ; GCN-NEXT: s_ashr_i32 s10, s5, 31 ; GCN-NEXT: s_add_u32 s12, s4, s10 ; GCN-NEXT: s_mov_b32 s11, s10 @@ -1596,9 +1600,11 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: s_mul_i32 s3, s8, s3 ; GCN-NEXT: s_sub_u32 s3, s12, s3 ; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 ; GCN-NEXT: s_subb_u32 s12, s16, s9 ; GCN-NEXT: s_sub_u32 s18, s3, s8 ; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s19, s12, 0 ; GCN-NEXT: s_cmp_ge_u32 s19, s9 ; GCN-NEXT: s_cselect_b32 s20, -1, 0 @@ -1608,10 +1614,12 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: s_cselect_b32 s20, s21, s20 ; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s12, s12, s9 -; GCN-NEXT: s_sub_u32 s16, s18, s8 +; GCN-NEXT: s_sub_u32 s21, s18, s8 +; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s12, s12, 0 ; GCN-NEXT: s_cmp_lg_u32 s20, 0 -; GCN-NEXT: s_cselect_b32 s16, s16, s18 +; GCN-NEXT: s_cselect_b32 s16, s21, s18 ; GCN-NEXT: s_cselect_b32 s12, s12, s19 ; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 ; GCN-NEXT: s_subb_u32 s5, s13, s5 @@ -1923,9 +1931,11 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TONGA-NEXT: v_readfirstlane_b32 s14, v0 ; TONGA-NEXT: s_sub_u32 s12, s12, s14 ; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s3, s3, s7 ; TONGA-NEXT: s_sub_u32 s18, s12, s6 ; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s19, s3, 0 ; TONGA-NEXT: s_cmp_ge_u32 s19, s7 ; TONGA-NEXT: s_cselect_b32 s20, -1, 0 @@ -1935,10 +1945,12 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TONGA-NEXT: s_cselect_b32 s20, s21, s20 ; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s3, s3, s7 -; TONGA-NEXT: s_sub_u32 s16, s18, s6 +; TONGA-NEXT: s_sub_u32 s21, s18, s6 +; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s3, s3, 0 ; TONGA-NEXT: s_cmp_lg_u32 s20, 0 -; TONGA-NEXT: s_cselect_b32 s16, s16, s18 +; TONGA-NEXT: s_cselect_b32 s16, s21, s18 ; TONGA-NEXT: s_cselect_b32 s3, s3, s19 ; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s5, s13, s5 @@ -2718,7 +2730,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GCN-NEXT: s_sub_u32 s9, 0, s6 -; GCN-NEXT: s_subb_u32 s14, 0, s7 +; GCN-NEXT: s_subb_u32 s16, 0, s7 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2727,52 +2739,56 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s15, v1 -; GCN-NEXT: v_readfirstlane_b32 s16, v0 -; GCN-NEXT: s_mul_i32 s17, s9, s15 -; GCN-NEXT: s_mul_hi_u32 s19, s9, s16 -; GCN-NEXT: s_mul_i32 s18, s14, s16 -; GCN-NEXT: s_add_i32 s17, s19, s17 -; GCN-NEXT: s_add_i32 s17, s17, s18 -; GCN-NEXT: s_mul_i32 s20, s9, s16 -; GCN-NEXT: s_mul_i32 s19, s16, s17 -; GCN-NEXT: s_mul_hi_u32 s21, s16, s20 -; GCN-NEXT: s_mul_hi_u32 s18, s16, s17 +; GCN-NEXT: v_readfirstlane_b32 s17, v1 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s15, s9, s17 +; GCN-NEXT: s_mul_hi_u32 s19, s9, s14 +; GCN-NEXT: s_mul_i32 s18, s16, s14 +; GCN-NEXT: s_add_i32 s15, s19, s15 +; GCN-NEXT: s_add_i32 s15, s15, s18 +; GCN-NEXT: s_mul_i32 s20, s9, s14 +; GCN-NEXT: s_mul_i32 s19, s14, s15 +; GCN-NEXT: s_mul_hi_u32 s21, s14, s20 +; GCN-NEXT: s_mul_hi_u32 s18, s14, s15 ; GCN-NEXT: s_add_u32 s19, s21, s19 ; GCN-NEXT: s_addc_u32 s18, 0, s18 -; GCN-NEXT: s_mul_hi_u32 s22, s15, s20 -; GCN-NEXT: s_mul_i32 s20, s15, s20 +; GCN-NEXT: s_mul_hi_u32 s22, s17, s20 +; GCN-NEXT: s_mul_i32 s20, s17, s20 ; GCN-NEXT: s_add_u32 s19, s19, s20 -; GCN-NEXT: s_mul_hi_u32 s21, s15, s17 +; GCN-NEXT: s_mul_hi_u32 s21, s17, s15 ; GCN-NEXT: s_addc_u32 s18, s18, s22 ; GCN-NEXT: s_addc_u32 s19, s21, 0 -; GCN-NEXT: s_mul_i32 s17, s15, s17 -; GCN-NEXT: s_add_u32 s17, s18, s17 +; GCN-NEXT: s_mul_i32 s15, s17, s15 +; GCN-NEXT: s_add_u32 s15, s18, s15 ; GCN-NEXT: s_addc_u32 s18, 0, s19 -; GCN-NEXT: s_add_u32 s16, s16, s17 -; GCN-NEXT: s_addc_u32 s15, s15, s18 -; GCN-NEXT: s_mul_i32 s17, s9, s15 -; GCN-NEXT: s_mul_hi_u32 s18, s9, s16 -; GCN-NEXT: s_add_i32 s17, s18, s17 -; GCN-NEXT: s_mul_i32 s14, s14, s16 -; GCN-NEXT: s_add_i32 s17, s17, s14 -; GCN-NEXT: s_mul_i32 s9, s9, s16 -; GCN-NEXT: s_mul_hi_u32 s18, s15, s9 -; GCN-NEXT: s_mul_i32 s19, s15, s9 -; GCN-NEXT: s_mul_i32 s21, s16, s17 -; GCN-NEXT: s_mul_hi_u32 s9, s16, s9 -; GCN-NEXT: s_mul_hi_u32 s20, s16, s17 +; GCN-NEXT: s_add_u32 s19, s14, s15 +; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 +; GCN-NEXT: s_addc_u32 s17, s17, s18 +; GCN-NEXT: s_mul_i32 s14, s9, s17 +; GCN-NEXT: s_mul_hi_u32 s15, s9, s19 +; GCN-NEXT: s_add_i32 s14, s15, s14 +; GCN-NEXT: s_mul_i32 s16, s16, s19 +; GCN-NEXT: s_add_i32 s14, s14, s16 +; GCN-NEXT: s_mul_i32 s9, s9, s19 +; GCN-NEXT: s_mul_hi_u32 s16, s17, s9 +; GCN-NEXT: s_mul_i32 s18, s17, s9 +; GCN-NEXT: s_mul_i32 s21, s19, s14 +; GCN-NEXT: s_mul_hi_u32 s9, s19, s9 +; GCN-NEXT: s_mul_hi_u32 s20, s19, s14 ; GCN-NEXT: s_add_u32 s9, s9, s21 ; GCN-NEXT: s_addc_u32 s20, 0, s20 -; GCN-NEXT: s_add_u32 s9, s9, s19 -; GCN-NEXT: s_mul_hi_u32 s14, s15, s17 -; GCN-NEXT: s_addc_u32 s9, s20, s18 -; GCN-NEXT: s_addc_u32 s14, s14, 0 -; GCN-NEXT: s_mul_i32 s17, s15, s17 -; GCN-NEXT: s_add_u32 s9, s9, s17 -; GCN-NEXT: s_addc_u32 s14, 0, s14 -; GCN-NEXT: s_add_u32 s9, s16, s9 -; GCN-NEXT: s_addc_u32 s18, s15, s14 +; GCN-NEXT: s_add_u32 s9, s9, s18 +; GCN-NEXT: s_mul_hi_u32 s15, s17, s14 +; GCN-NEXT: s_addc_u32 s9, s20, s16 +; GCN-NEXT: s_addc_u32 s15, s15, 0 +; GCN-NEXT: s_mul_i32 s14, s17, s14 +; GCN-NEXT: s_add_u32 s9, s9, s14 +; GCN-NEXT: s_addc_u32 s16, 0, s15 +; GCN-NEXT: s_add_u32 s9, s19, s9 +; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 +; GCN-NEXT: s_addc_u32 s18, s17, s16 ; GCN-NEXT: s_ashr_i32 s14, s11, 31 ; GCN-NEXT: s_add_u32 s16, s10, s14 ; GCN-NEXT: s_mov_b32 s15, s14 @@ -2801,9 +2817,11 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s9, s6, s9 ; GCN-NEXT: s_sub_u32 s9, s16, s9 ; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s16, s20, s7 ; GCN-NEXT: s_sub_u32 s22, s9, s6 ; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s23, s16, 0 ; GCN-NEXT: s_cmp_ge_u32 s23, s7 ; GCN-NEXT: s_cselect_b32 s24, -1, 0 @@ -2813,10 +2831,12 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s24, s25, s24 ; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s16, s16, s7 -; GCN-NEXT: s_sub_u32 s20, s22, s6 +; GCN-NEXT: s_sub_u32 s25, s22, s6 +; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s16, s16, 0 ; GCN-NEXT: s_cmp_lg_u32 s24, 0 -; GCN-NEXT: s_cselect_b32 s20, s20, s22 +; GCN-NEXT: s_cselect_b32 s20, s25, s22 ; GCN-NEXT: s_cselect_b32 s16, s16, s23 ; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s11, s17, s11 @@ -2867,7 +2887,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GCN-NEXT: s_sub_u32 s3, 0, s10 -; GCN-NEXT: s_subb_u32 s12, 0, s11 +; GCN-NEXT: s_subb_u32 s14, 0, s11 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2876,52 +2896,56 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s13, v1 -; GCN-NEXT: v_readfirstlane_b32 s14, v0 -; GCN-NEXT: s_mul_i32 s15, s3, s13 -; GCN-NEXT: s_mul_hi_u32 s17, s3, s14 -; GCN-NEXT: s_mul_i32 s16, s12, s14 -; GCN-NEXT: s_add_i32 s15, s17, s15 -; GCN-NEXT: s_add_i32 s15, s15, s16 -; GCN-NEXT: s_mul_i32 s18, s3, s14 -; GCN-NEXT: s_mul_i32 s17, s14, s15 -; GCN-NEXT: s_mul_hi_u32 s19, s14, s18 -; GCN-NEXT: s_mul_hi_u32 s16, s14, s15 +; GCN-NEXT: v_readfirstlane_b32 s15, v1 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_mul_i32 s13, s3, s15 +; GCN-NEXT: s_mul_hi_u32 s17, s3, s12 +; GCN-NEXT: s_mul_i32 s16, s14, s12 +; GCN-NEXT: s_add_i32 s13, s17, s13 +; GCN-NEXT: s_add_i32 s13, s13, s16 +; GCN-NEXT: s_mul_i32 s18, s3, s12 +; GCN-NEXT: s_mul_i32 s17, s12, s13 +; GCN-NEXT: s_mul_hi_u32 s19, s12, s18 +; GCN-NEXT: s_mul_hi_u32 s16, s12, s13 ; GCN-NEXT: s_add_u32 s17, s19, s17 ; GCN-NEXT: s_addc_u32 s16, 0, s16 -; GCN-NEXT: s_mul_hi_u32 s20, s13, s18 -; GCN-NEXT: s_mul_i32 s18, s13, s18 +; GCN-NEXT: s_mul_hi_u32 s20, s15, s18 +; GCN-NEXT: s_mul_i32 s18, s15, s18 ; GCN-NEXT: s_add_u32 s17, s17, s18 -; GCN-NEXT: s_mul_hi_u32 s19, s13, s15 +; GCN-NEXT: s_mul_hi_u32 s19, s15, s13 ; GCN-NEXT: s_addc_u32 s16, s16, s20 ; GCN-NEXT: s_addc_u32 s17, s19, 0 -; GCN-NEXT: s_mul_i32 s15, s13, s15 -; GCN-NEXT: s_add_u32 s15, s16, s15 +; GCN-NEXT: s_mul_i32 s13, s15, s13 +; GCN-NEXT: s_add_u32 s13, s16, s13 ; GCN-NEXT: s_addc_u32 s16, 0, s17 -; GCN-NEXT: s_add_u32 s14, s14, s15 -; GCN-NEXT: s_addc_u32 s13, s13, s16 -; GCN-NEXT: s_mul_i32 s15, s3, s13 -; GCN-NEXT: s_mul_hi_u32 s16, s3, s14 -; GCN-NEXT: s_add_i32 s15, s16, s15 -; GCN-NEXT: s_mul_i32 s12, s12, s14 -; GCN-NEXT: s_add_i32 s15, s15, s12 -; GCN-NEXT: s_mul_i32 s3, s3, s14 -; GCN-NEXT: s_mul_hi_u32 s16, s13, s3 -; GCN-NEXT: s_mul_i32 s17, s13, s3 -; GCN-NEXT: s_mul_i32 s19, s14, s15 -; GCN-NEXT: s_mul_hi_u32 s3, s14, s3 -; GCN-NEXT: s_mul_hi_u32 s18, s14, s15 +; GCN-NEXT: s_add_u32 s17, s12, s13 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GCN-NEXT: s_addc_u32 s15, s15, s16 +; GCN-NEXT: s_mul_i32 s12, s3, s15 +; GCN-NEXT: s_mul_hi_u32 s13, s3, s17 +; GCN-NEXT: s_add_i32 s12, s13, s12 +; GCN-NEXT: s_mul_i32 s14, s14, s17 +; GCN-NEXT: s_add_i32 s12, s12, s14 +; GCN-NEXT: s_mul_i32 s3, s3, s17 +; GCN-NEXT: s_mul_hi_u32 s14, s15, s3 +; GCN-NEXT: s_mul_i32 s16, s15, s3 +; GCN-NEXT: s_mul_i32 s19, s17, s12 +; GCN-NEXT: s_mul_hi_u32 s3, s17, s3 +; GCN-NEXT: s_mul_hi_u32 s18, s17, s12 ; GCN-NEXT: s_add_u32 s3, s3, s19 ; GCN-NEXT: s_addc_u32 s18, 0, s18 -; GCN-NEXT: s_add_u32 s3, s3, s17 -; GCN-NEXT: s_mul_hi_u32 s12, s13, s15 -; GCN-NEXT: s_addc_u32 s3, s18, s16 -; GCN-NEXT: s_addc_u32 s12, s12, 0 -; GCN-NEXT: s_mul_i32 s15, s13, s15 -; GCN-NEXT: s_add_u32 s3, s3, s15 -; GCN-NEXT: s_addc_u32 s12, 0, s12 -; GCN-NEXT: s_add_u32 s3, s14, s3 -; GCN-NEXT: s_addc_u32 s16, s13, s12 +; GCN-NEXT: s_add_u32 s3, s3, s16 +; GCN-NEXT: s_mul_hi_u32 s13, s15, s12 +; GCN-NEXT: s_addc_u32 s3, s18, s14 +; GCN-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NEXT: s_mul_i32 s12, s15, s12 +; GCN-NEXT: s_add_u32 s3, s3, s12 +; GCN-NEXT: s_addc_u32 s14, 0, s13 +; GCN-NEXT: s_add_u32 s3, s17, s3 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GCN-NEXT: s_addc_u32 s16, s15, s14 ; GCN-NEXT: s_ashr_i32 s12, s5, 31 ; GCN-NEXT: s_add_u32 s14, s4, s12 ; GCN-NEXT: s_mov_b32 s13, s12 @@ -2950,9 +2974,11 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s3, s10, s3 ; GCN-NEXT: s_sub_u32 s3, s14, s3 ; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s14, s18, s11 ; GCN-NEXT: s_sub_u32 s20, s3, s10 ; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s21, s14, 0 ; GCN-NEXT: s_cmp_ge_u32 s21, s11 ; GCN-NEXT: s_cselect_b32 s22, -1, 0 @@ -2962,10 +2988,12 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s22, s23, s22 ; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s14, s14, s11 -; GCN-NEXT: s_sub_u32 s18, s20, s10 +; GCN-NEXT: s_sub_u32 s23, s20, s10 +; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s14, s14, 0 ; GCN-NEXT: s_cmp_lg_u32 s22, 0 -; GCN-NEXT: s_cselect_b32 s18, s18, s20 +; GCN-NEXT: s_cselect_b32 s18, s23, s20 ; GCN-NEXT: s_cselect_b32 s14, s14, s21 ; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s5, s15, s5 @@ -3435,9 +3463,11 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_readfirstlane_b32 s14, v0 ; TONGA-NEXT: s_sub_u32 s12, s12, s14 ; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, s7 ; TONGA-NEXT: s_sub_u32 s18, s12, s6 ; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s19, s1, 0 ; TONGA-NEXT: s_cmp_ge_u32 s19, s7 ; TONGA-NEXT: s_cselect_b32 s20, -1, 0 @@ -3447,10 +3477,12 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_cselect_b32 s20, s21, s20 ; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, s7 -; TONGA-NEXT: s_sub_u32 s16, s18, s6 +; TONGA-NEXT: s_sub_u32 s21, s18, s6 +; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, 0 ; TONGA-NEXT: s_cmp_lg_u32 s20, 0 -; TONGA-NEXT: s_cselect_b32 s16, s16, s18 +; TONGA-NEXT: s_cselect_b32 s16, s21, s18 ; TONGA-NEXT: s_cselect_b32 s1, s1, s19 ; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s3, s13, s3 @@ -4902,7 +4934,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GCN-NEXT: s_sub_u32 s17, 0, s6 -; GCN-NEXT: s_subb_u32 s22, 0, s7 +; GCN-NEXT: s_subb_u32 s24, 0, s7 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -4911,52 +4943,56 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s23, v1 -; GCN-NEXT: v_readfirstlane_b32 s24, v0 -; GCN-NEXT: s_mul_i32 s25, s17, s23 -; GCN-NEXT: s_mul_hi_u32 s27, s17, s24 -; GCN-NEXT: s_mul_i32 s26, s22, s24 -; GCN-NEXT: s_add_i32 s25, s27, s25 -; GCN-NEXT: s_add_i32 s25, s25, s26 -; GCN-NEXT: s_mul_i32 s28, s17, s24 -; GCN-NEXT: s_mul_i32 s27, s24, s25 -; GCN-NEXT: s_mul_hi_u32 s29, s24, s28 -; GCN-NEXT: s_mul_hi_u32 s26, s24, s25 +; GCN-NEXT: v_readfirstlane_b32 s25, v1 +; GCN-NEXT: v_readfirstlane_b32 s22, v0 +; GCN-NEXT: s_mul_i32 s23, s17, s25 +; GCN-NEXT: s_mul_hi_u32 s27, s17, s22 +; GCN-NEXT: s_mul_i32 s26, s24, s22 +; GCN-NEXT: s_add_i32 s23, s27, s23 +; GCN-NEXT: s_add_i32 s23, s23, s26 +; GCN-NEXT: s_mul_i32 s28, s17, s22 +; GCN-NEXT: s_mul_i32 s27, s22, s23 +; GCN-NEXT: s_mul_hi_u32 s29, s22, s28 +; GCN-NEXT: s_mul_hi_u32 s26, s22, s23 ; GCN-NEXT: s_add_u32 s27, s29, s27 ; GCN-NEXT: s_addc_u32 s26, 0, s26 -; GCN-NEXT: s_mul_hi_u32 s30, s23, s28 -; GCN-NEXT: s_mul_i32 s28, s23, s28 +; GCN-NEXT: s_mul_hi_u32 s30, s25, s28 +; GCN-NEXT: s_mul_i32 s28, s25, s28 ; GCN-NEXT: s_add_u32 s27, s27, s28 -; GCN-NEXT: s_mul_hi_u32 s29, s23, s25 +; GCN-NEXT: s_mul_hi_u32 s29, s25, s23 ; GCN-NEXT: s_addc_u32 s26, s26, s30 ; GCN-NEXT: s_addc_u32 s27, s29, 0 -; GCN-NEXT: s_mul_i32 s25, s23, s25 -; GCN-NEXT: s_add_u32 s25, s26, s25 +; GCN-NEXT: s_mul_i32 s23, s25, s23 +; GCN-NEXT: s_add_u32 s23, s26, s23 ; GCN-NEXT: s_addc_u32 s26, 0, s27 -; GCN-NEXT: s_add_u32 s24, s24, s25 -; GCN-NEXT: s_addc_u32 s23, s23, s26 -; GCN-NEXT: s_mul_i32 s25, s17, s23 -; GCN-NEXT: s_mul_hi_u32 s26, s17, s24 -; GCN-NEXT: s_add_i32 s25, s26, s25 -; GCN-NEXT: s_mul_i32 s22, s22, s24 -; GCN-NEXT: s_add_i32 s25, s25, s22 -; GCN-NEXT: s_mul_i32 s17, s17, s24 -; GCN-NEXT: s_mul_hi_u32 s26, s23, s17 -; GCN-NEXT: s_mul_i32 s27, s23, s17 -; GCN-NEXT: s_mul_i32 s29, s24, s25 -; GCN-NEXT: s_mul_hi_u32 s17, s24, s17 -; GCN-NEXT: s_mul_hi_u32 s28, s24, s25 +; GCN-NEXT: s_add_u32 s27, s22, s23 +; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 +; GCN-NEXT: s_addc_u32 s25, s25, s26 +; GCN-NEXT: s_mul_i32 s22, s17, s25 +; GCN-NEXT: s_mul_hi_u32 s23, s17, s27 +; GCN-NEXT: s_add_i32 s22, s23, s22 +; GCN-NEXT: s_mul_i32 s24, s24, s27 +; GCN-NEXT: s_add_i32 s22, s22, s24 +; GCN-NEXT: s_mul_i32 s17, s17, s27 +; GCN-NEXT: s_mul_hi_u32 s24, s25, s17 +; GCN-NEXT: s_mul_i32 s26, s25, s17 +; GCN-NEXT: s_mul_i32 s29, s27, s22 +; GCN-NEXT: s_mul_hi_u32 s17, s27, s17 +; GCN-NEXT: s_mul_hi_u32 s28, s27, s22 ; GCN-NEXT: s_add_u32 s17, s17, s29 ; GCN-NEXT: s_addc_u32 s28, 0, s28 -; GCN-NEXT: s_add_u32 s17, s17, s27 -; GCN-NEXT: s_mul_hi_u32 s22, s23, s25 -; GCN-NEXT: s_addc_u32 s17, s28, s26 -; GCN-NEXT: s_addc_u32 s22, s22, 0 -; GCN-NEXT: s_mul_i32 s25, s23, s25 -; GCN-NEXT: s_add_u32 s17, s17, s25 -; GCN-NEXT: s_addc_u32 s22, 0, s22 -; GCN-NEXT: s_add_u32 s17, s24, s17 -; GCN-NEXT: s_addc_u32 s26, s23, s22 +; GCN-NEXT: s_add_u32 s17, s17, s26 +; GCN-NEXT: s_mul_hi_u32 s23, s25, s22 +; GCN-NEXT: s_addc_u32 s17, s28, s24 +; GCN-NEXT: s_addc_u32 s23, s23, 0 +; GCN-NEXT: s_mul_i32 s22, s25, s22 +; GCN-NEXT: s_add_u32 s17, s17, s22 +; GCN-NEXT: s_addc_u32 s24, 0, s23 +; GCN-NEXT: s_add_u32 s17, s27, s17 +; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 +; GCN-NEXT: s_addc_u32 s26, s25, s24 ; GCN-NEXT: s_ashr_i32 s22, s19, 31 ; GCN-NEXT: s_add_u32 s24, s18, s22 ; GCN-NEXT: s_mov_b32 s23, s22 @@ -4985,9 +5021,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s17, s6, s17 ; GCN-NEXT: s_sub_u32 s17, s24, s17 ; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s24, s28, s7 ; GCN-NEXT: s_sub_u32 s30, s17, s6 ; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0 ; GCN-NEXT: s_subb_u32 s31, s24, 0 ; GCN-NEXT: s_cmp_ge_u32 s31, s7 ; GCN-NEXT: s_cselect_b32 s33, -1, 0 @@ -4997,10 +5035,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s33, s34, s33 ; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0 ; GCN-NEXT: s_subb_u32 s24, s24, s7 -; GCN-NEXT: s_sub_u32 s28, s30, s6 +; GCN-NEXT: s_sub_u32 s34, s30, s6 +; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0 ; GCN-NEXT: s_subb_u32 s24, s24, 0 ; GCN-NEXT: s_cmp_lg_u32 s33, 0 -; GCN-NEXT: s_cselect_b32 s28, s28, s30 +; GCN-NEXT: s_cselect_b32 s28, s34, s30 ; GCN-NEXT: s_cselect_b32 s24, s24, s31 ; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s19, s25, s19 @@ -5051,7 +5091,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s18 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s19 ; GCN-NEXT: s_sub_u32 s13, 0, s18 -; GCN-NEXT: s_subb_u32 s20, 0, s19 +; GCN-NEXT: s_subb_u32 s22, 0, s19 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -5060,52 +5100,56 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s21, v1 -; GCN-NEXT: v_readfirstlane_b32 s22, v0 -; GCN-NEXT: s_mul_i32 s23, s13, s21 -; GCN-NEXT: s_mul_hi_u32 s25, s13, s22 -; GCN-NEXT: s_mul_i32 s24, s20, s22 -; GCN-NEXT: s_add_i32 s23, s25, s23 -; GCN-NEXT: s_add_i32 s23, s23, s24 -; GCN-NEXT: s_mul_i32 s26, s13, s22 -; GCN-NEXT: s_mul_i32 s25, s22, s23 -; GCN-NEXT: s_mul_hi_u32 s27, s22, s26 -; GCN-NEXT: s_mul_hi_u32 s24, s22, s23 +; GCN-NEXT: v_readfirstlane_b32 s23, v1 +; GCN-NEXT: v_readfirstlane_b32 s20, v0 +; GCN-NEXT: s_mul_i32 s21, s13, s23 +; GCN-NEXT: s_mul_hi_u32 s25, s13, s20 +; GCN-NEXT: s_mul_i32 s24, s22, s20 +; GCN-NEXT: s_add_i32 s21, s25, s21 +; GCN-NEXT: s_add_i32 s21, s21, s24 +; GCN-NEXT: s_mul_i32 s26, s13, s20 +; GCN-NEXT: s_mul_i32 s25, s20, s21 +; GCN-NEXT: s_mul_hi_u32 s27, s20, s26 +; GCN-NEXT: s_mul_hi_u32 s24, s20, s21 ; GCN-NEXT: s_add_u32 s25, s27, s25 ; GCN-NEXT: s_addc_u32 s24, 0, s24 -; GCN-NEXT: s_mul_hi_u32 s28, s21, s26 -; GCN-NEXT: s_mul_i32 s26, s21, s26 +; GCN-NEXT: s_mul_hi_u32 s28, s23, s26 +; GCN-NEXT: s_mul_i32 s26, s23, s26 ; GCN-NEXT: s_add_u32 s25, s25, s26 -; GCN-NEXT: s_mul_hi_u32 s27, s21, s23 +; GCN-NEXT: s_mul_hi_u32 s27, s23, s21 ; GCN-NEXT: s_addc_u32 s24, s24, s28 ; GCN-NEXT: s_addc_u32 s25, s27, 0 -; GCN-NEXT: s_mul_i32 s23, s21, s23 -; GCN-NEXT: s_add_u32 s23, s24, s23 +; GCN-NEXT: s_mul_i32 s21, s23, s21 +; GCN-NEXT: s_add_u32 s21, s24, s21 ; GCN-NEXT: s_addc_u32 s24, 0, s25 -; GCN-NEXT: s_add_u32 s22, s22, s23 -; GCN-NEXT: s_addc_u32 s21, s21, s24 -; GCN-NEXT: s_mul_i32 s23, s13, s21 -; GCN-NEXT: s_mul_hi_u32 s24, s13, s22 -; GCN-NEXT: s_add_i32 s23, s24, s23 -; GCN-NEXT: s_mul_i32 s20, s20, s22 -; GCN-NEXT: s_add_i32 s23, s23, s20 -; GCN-NEXT: s_mul_i32 s13, s13, s22 -; GCN-NEXT: s_mul_hi_u32 s24, s21, s13 -; GCN-NEXT: s_mul_i32 s25, s21, s13 -; GCN-NEXT: s_mul_i32 s27, s22, s23 -; GCN-NEXT: s_mul_hi_u32 s13, s22, s13 -; GCN-NEXT: s_mul_hi_u32 s26, s22, s23 +; GCN-NEXT: s_add_u32 s25, s20, s21 +; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 +; GCN-NEXT: s_addc_u32 s23, s23, s24 +; GCN-NEXT: s_mul_i32 s20, s13, s23 +; GCN-NEXT: s_mul_hi_u32 s21, s13, s25 +; GCN-NEXT: s_add_i32 s20, s21, s20 +; GCN-NEXT: s_mul_i32 s22, s22, s25 +; GCN-NEXT: s_add_i32 s20, s20, s22 +; GCN-NEXT: s_mul_i32 s13, s13, s25 +; GCN-NEXT: s_mul_hi_u32 s22, s23, s13 +; GCN-NEXT: s_mul_i32 s24, s23, s13 +; GCN-NEXT: s_mul_i32 s27, s25, s20 +; GCN-NEXT: s_mul_hi_u32 s13, s25, s13 +; GCN-NEXT: s_mul_hi_u32 s26, s25, s20 ; GCN-NEXT: s_add_u32 s13, s13, s27 ; GCN-NEXT: s_addc_u32 s26, 0, s26 -; GCN-NEXT: s_add_u32 s13, s13, s25 -; GCN-NEXT: s_mul_hi_u32 s20, s21, s23 -; GCN-NEXT: s_addc_u32 s13, s26, s24 -; GCN-NEXT: s_addc_u32 s20, s20, 0 -; GCN-NEXT: s_mul_i32 s23, s21, s23 -; GCN-NEXT: s_add_u32 s13, s13, s23 -; GCN-NEXT: s_addc_u32 s20, 0, s20 -; GCN-NEXT: s_add_u32 s13, s22, s13 -; GCN-NEXT: s_addc_u32 s24, s21, s20 +; GCN-NEXT: s_add_u32 s13, s13, s24 +; GCN-NEXT: s_mul_hi_u32 s21, s23, s20 +; GCN-NEXT: s_addc_u32 s13, s26, s22 +; GCN-NEXT: s_addc_u32 s21, s21, 0 +; GCN-NEXT: s_mul_i32 s20, s23, s20 +; GCN-NEXT: s_add_u32 s13, s13, s20 +; GCN-NEXT: s_addc_u32 s22, 0, s21 +; GCN-NEXT: s_add_u32 s13, s25, s13 +; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 +; GCN-NEXT: s_addc_u32 s24, s23, s22 ; GCN-NEXT: s_ashr_i32 s20, s15, 31 ; GCN-NEXT: s_add_u32 s22, s14, s20 ; GCN-NEXT: s_mov_b32 s21, s20 @@ -5134,9 +5178,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s13, s18, s13 ; GCN-NEXT: s_sub_u32 s13, s22, s13 ; GCN-NEXT: s_cselect_b64 s[24:25], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0 ; GCN-NEXT: s_subb_u32 s22, s26, s19 ; GCN-NEXT: s_sub_u32 s28, s13, s18 ; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s29, s22, 0 ; GCN-NEXT: s_cmp_ge_u32 s29, s19 ; GCN-NEXT: s_cselect_b32 s30, -1, 0 @@ -5146,10 +5192,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s30, s31, s30 ; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s22, s22, s19 -; GCN-NEXT: s_sub_u32 s26, s28, s18 +; GCN-NEXT: s_sub_u32 s31, s28, s18 +; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s22, s22, 0 ; GCN-NEXT: s_cmp_lg_u32 s30, 0 -; GCN-NEXT: s_cselect_b32 s26, s26, s28 +; GCN-NEXT: s_cselect_b32 s26, s31, s28 ; GCN-NEXT: s_cselect_b32 s22, s22, s29 ; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0 ; GCN-NEXT: s_subb_u32 s15, s23, s15 @@ -5209,7 +5257,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s14 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s15 ; GCN-NEXT: s_sub_u32 s9, 0, s14 -; GCN-NEXT: s_subb_u32 s16, 0, s15 +; GCN-NEXT: s_subb_u32 s18, 0, s15 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -5218,52 +5266,56 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s17, v1 -; GCN-NEXT: v_readfirstlane_b32 s18, v0 -; GCN-NEXT: s_mul_i32 s19, s9, s17 -; GCN-NEXT: s_mul_hi_u32 s21, s9, s18 -; GCN-NEXT: s_mul_i32 s20, s16, s18 -; GCN-NEXT: s_add_i32 s19, s21, s19 -; GCN-NEXT: s_add_i32 s19, s19, s20 -; GCN-NEXT: s_mul_i32 s22, s9, s18 -; GCN-NEXT: s_mul_i32 s21, s18, s19 -; GCN-NEXT: s_mul_hi_u32 s23, s18, s22 -; GCN-NEXT: s_mul_hi_u32 s20, s18, s19 +; GCN-NEXT: v_readfirstlane_b32 s19, v1 +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; GCN-NEXT: s_mul_i32 s17, s9, s19 +; GCN-NEXT: s_mul_hi_u32 s21, s9, s16 +; GCN-NEXT: s_mul_i32 s20, s18, s16 +; GCN-NEXT: s_add_i32 s17, s21, s17 +; GCN-NEXT: s_add_i32 s17, s17, s20 +; GCN-NEXT: s_mul_i32 s22, s9, s16 +; GCN-NEXT: s_mul_i32 s21, s16, s17 +; GCN-NEXT: s_mul_hi_u32 s23, s16, s22 +; GCN-NEXT: s_mul_hi_u32 s20, s16, s17 ; GCN-NEXT: s_add_u32 s21, s23, s21 ; GCN-NEXT: s_addc_u32 s20, 0, s20 -; GCN-NEXT: s_mul_hi_u32 s24, s17, s22 -; GCN-NEXT: s_mul_i32 s22, s17, s22 +; GCN-NEXT: s_mul_hi_u32 s24, s19, s22 +; GCN-NEXT: s_mul_i32 s22, s19, s22 ; GCN-NEXT: s_add_u32 s21, s21, s22 -; GCN-NEXT: s_mul_hi_u32 s23, s17, s19 +; GCN-NEXT: s_mul_hi_u32 s23, s19, s17 ; GCN-NEXT: s_addc_u32 s20, s20, s24 ; GCN-NEXT: s_addc_u32 s21, s23, 0 -; GCN-NEXT: s_mul_i32 s19, s17, s19 -; GCN-NEXT: s_add_u32 s19, s20, s19 +; GCN-NEXT: s_mul_i32 s17, s19, s17 +; GCN-NEXT: s_add_u32 s17, s20, s17 ; GCN-NEXT: s_addc_u32 s20, 0, s21 -; GCN-NEXT: s_add_u32 s18, s18, s19 -; GCN-NEXT: s_addc_u32 s17, s17, s20 -; GCN-NEXT: s_mul_i32 s19, s9, s17 -; GCN-NEXT: s_mul_hi_u32 s20, s9, s18 -; GCN-NEXT: s_add_i32 s19, s20, s19 -; GCN-NEXT: s_mul_i32 s16, s16, s18 -; GCN-NEXT: s_add_i32 s19, s19, s16 -; GCN-NEXT: s_mul_i32 s9, s9, s18 -; GCN-NEXT: s_mul_hi_u32 s20, s17, s9 -; GCN-NEXT: s_mul_i32 s21, s17, s9 -; GCN-NEXT: s_mul_i32 s23, s18, s19 -; GCN-NEXT: s_mul_hi_u32 s9, s18, s9 -; GCN-NEXT: s_mul_hi_u32 s22, s18, s19 +; GCN-NEXT: s_add_u32 s21, s16, s17 +; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GCN-NEXT: s_addc_u32 s19, s19, s20 +; GCN-NEXT: s_mul_i32 s16, s9, s19 +; GCN-NEXT: s_mul_hi_u32 s17, s9, s21 +; GCN-NEXT: s_add_i32 s16, s17, s16 +; GCN-NEXT: s_mul_i32 s18, s18, s21 +; GCN-NEXT: s_add_i32 s16, s16, s18 +; GCN-NEXT: s_mul_i32 s9, s9, s21 +; GCN-NEXT: s_mul_hi_u32 s18, s19, s9 +; GCN-NEXT: s_mul_i32 s20, s19, s9 +; GCN-NEXT: s_mul_i32 s23, s21, s16 +; GCN-NEXT: s_mul_hi_u32 s9, s21, s9 +; GCN-NEXT: s_mul_hi_u32 s22, s21, s16 ; GCN-NEXT: s_add_u32 s9, s9, s23 ; GCN-NEXT: s_addc_u32 s22, 0, s22 -; GCN-NEXT: s_add_u32 s9, s9, s21 -; GCN-NEXT: s_mul_hi_u32 s16, s17, s19 -; GCN-NEXT: s_addc_u32 s9, s22, s20 -; GCN-NEXT: s_addc_u32 s16, s16, 0 -; GCN-NEXT: s_mul_i32 s19, s17, s19 -; GCN-NEXT: s_add_u32 s9, s9, s19 -; GCN-NEXT: s_addc_u32 s16, 0, s16 -; GCN-NEXT: s_add_u32 s9, s18, s9 -; GCN-NEXT: s_addc_u32 s20, s17, s16 +; GCN-NEXT: s_add_u32 s9, s9, s20 +; GCN-NEXT: s_mul_hi_u32 s17, s19, s16 +; GCN-NEXT: s_addc_u32 s9, s22, s18 +; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_mul_i32 s16, s19, s16 +; GCN-NEXT: s_add_u32 s9, s9, s16 +; GCN-NEXT: s_addc_u32 s18, 0, s17 +; GCN-NEXT: s_add_u32 s9, s21, s9 +; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GCN-NEXT: s_addc_u32 s20, s19, s18 ; GCN-NEXT: s_ashr_i32 s16, s11, 31 ; GCN-NEXT: s_add_u32 s18, s10, s16 ; GCN-NEXT: s_mov_b32 s17, s16 @@ -5292,9 +5344,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s9, s14, s9 ; GCN-NEXT: s_sub_u32 s9, s18, s9 ; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s18, s22, s15 ; GCN-NEXT: s_sub_u32 s24, s9, s14 ; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 ; GCN-NEXT: s_subb_u32 s25, s18, 0 ; GCN-NEXT: s_cmp_ge_u32 s25, s15 ; GCN-NEXT: s_cselect_b32 s26, -1, 0 @@ -5304,10 +5358,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s26, s27, s26 ; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 ; GCN-NEXT: s_subb_u32 s18, s18, s15 -; GCN-NEXT: s_sub_u32 s22, s24, s14 +; GCN-NEXT: s_sub_u32 s27, s24, s14 +; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 ; GCN-NEXT: s_subb_u32 s18, s18, 0 ; GCN-NEXT: s_cmp_lg_u32 s26, 0 -; GCN-NEXT: s_cselect_b32 s22, s22, s24 +; GCN-NEXT: s_cselect_b32 s22, s27, s24 ; GCN-NEXT: s_cselect_b32 s18, s18, s25 ; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s11, s19, s11 @@ -5364,7 +5420,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GCN-NEXT: s_sub_u32 s3, 0, s10 -; GCN-NEXT: s_subb_u32 s12, 0, s11 +; GCN-NEXT: s_subb_u32 s14, 0, s11 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -5373,52 +5429,56 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s13, v1 -; GCN-NEXT: v_readfirstlane_b32 s14, v0 -; GCN-NEXT: s_mul_i32 s15, s3, s13 -; GCN-NEXT: s_mul_hi_u32 s17, s3, s14 -; GCN-NEXT: s_mul_i32 s16, s12, s14 -; GCN-NEXT: s_add_i32 s15, s17, s15 -; GCN-NEXT: s_add_i32 s15, s15, s16 -; GCN-NEXT: s_mul_i32 s18, s3, s14 -; GCN-NEXT: s_mul_i32 s17, s14, s15 -; GCN-NEXT: s_mul_hi_u32 s19, s14, s18 -; GCN-NEXT: s_mul_hi_u32 s16, s14, s15 +; GCN-NEXT: v_readfirstlane_b32 s15, v1 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_mul_i32 s13, s3, s15 +; GCN-NEXT: s_mul_hi_u32 s17, s3, s12 +; GCN-NEXT: s_mul_i32 s16, s14, s12 +; GCN-NEXT: s_add_i32 s13, s17, s13 +; GCN-NEXT: s_add_i32 s13, s13, s16 +; GCN-NEXT: s_mul_i32 s18, s3, s12 +; GCN-NEXT: s_mul_i32 s17, s12, s13 +; GCN-NEXT: s_mul_hi_u32 s19, s12, s18 +; GCN-NEXT: s_mul_hi_u32 s16, s12, s13 ; GCN-NEXT: s_add_u32 s17, s19, s17 ; GCN-NEXT: s_addc_u32 s16, 0, s16 -; GCN-NEXT: s_mul_hi_u32 s20, s13, s18 -; GCN-NEXT: s_mul_i32 s18, s13, s18 +; GCN-NEXT: s_mul_hi_u32 s20, s15, s18 +; GCN-NEXT: s_mul_i32 s18, s15, s18 ; GCN-NEXT: s_add_u32 s17, s17, s18 -; GCN-NEXT: s_mul_hi_u32 s19, s13, s15 +; GCN-NEXT: s_mul_hi_u32 s19, s15, s13 ; GCN-NEXT: s_addc_u32 s16, s16, s20 ; GCN-NEXT: s_addc_u32 s17, s19, 0 -; GCN-NEXT: s_mul_i32 s15, s13, s15 -; GCN-NEXT: s_add_u32 s15, s16, s15 +; GCN-NEXT: s_mul_i32 s13, s15, s13 +; GCN-NEXT: s_add_u32 s13, s16, s13 ; GCN-NEXT: s_addc_u32 s16, 0, s17 -; GCN-NEXT: s_add_u32 s14, s14, s15 -; GCN-NEXT: s_addc_u32 s13, s13, s16 -; GCN-NEXT: s_mul_i32 s15, s3, s13 -; GCN-NEXT: s_mul_hi_u32 s16, s3, s14 -; GCN-NEXT: s_add_i32 s15, s16, s15 -; GCN-NEXT: s_mul_i32 s12, s12, s14 -; GCN-NEXT: s_add_i32 s15, s15, s12 -; GCN-NEXT: s_mul_i32 s3, s3, s14 -; GCN-NEXT: s_mul_hi_u32 s16, s13, s3 -; GCN-NEXT: s_mul_i32 s17, s13, s3 -; GCN-NEXT: s_mul_i32 s19, s14, s15 -; GCN-NEXT: s_mul_hi_u32 s3, s14, s3 -; GCN-NEXT: s_mul_hi_u32 s18, s14, s15 +; GCN-NEXT: s_add_u32 s17, s12, s13 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GCN-NEXT: s_addc_u32 s15, s15, s16 +; GCN-NEXT: s_mul_i32 s12, s3, s15 +; GCN-NEXT: s_mul_hi_u32 s13, s3, s17 +; GCN-NEXT: s_add_i32 s12, s13, s12 +; GCN-NEXT: s_mul_i32 s14, s14, s17 +; GCN-NEXT: s_add_i32 s12, s12, s14 +; GCN-NEXT: s_mul_i32 s3, s3, s17 +; GCN-NEXT: s_mul_hi_u32 s14, s15, s3 +; GCN-NEXT: s_mul_i32 s16, s15, s3 +; GCN-NEXT: s_mul_i32 s19, s17, s12 +; GCN-NEXT: s_mul_hi_u32 s3, s17, s3 +; GCN-NEXT: s_mul_hi_u32 s18, s17, s12 ; GCN-NEXT: s_add_u32 s3, s3, s19 ; GCN-NEXT: s_addc_u32 s18, 0, s18 -; GCN-NEXT: s_add_u32 s3, s3, s17 -; GCN-NEXT: s_mul_hi_u32 s12, s13, s15 -; GCN-NEXT: s_addc_u32 s3, s18, s16 -; GCN-NEXT: s_addc_u32 s12, s12, 0 -; GCN-NEXT: s_mul_i32 s15, s13, s15 -; GCN-NEXT: s_add_u32 s3, s3, s15 -; GCN-NEXT: s_addc_u32 s12, 0, s12 -; GCN-NEXT: s_add_u32 s3, s14, s3 -; GCN-NEXT: s_addc_u32 s16, s13, s12 +; GCN-NEXT: s_add_u32 s3, s3, s16 +; GCN-NEXT: s_mul_hi_u32 s13, s15, s12 +; GCN-NEXT: s_addc_u32 s3, s18, s14 +; GCN-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NEXT: s_mul_i32 s12, s15, s12 +; GCN-NEXT: s_add_u32 s3, s3, s12 +; GCN-NEXT: s_addc_u32 s14, 0, s13 +; GCN-NEXT: s_add_u32 s3, s17, s3 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GCN-NEXT: s_addc_u32 s16, s15, s14 ; GCN-NEXT: s_ashr_i32 s12, s5, 31 ; GCN-NEXT: s_add_u32 s14, s4, s12 ; GCN-NEXT: s_mov_b32 s13, s12 @@ -5447,9 +5507,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s3, s10, s3 ; GCN-NEXT: s_sub_u32 s3, s14, s3 ; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s14, s18, s11 ; GCN-NEXT: s_sub_u32 s20, s3, s10 ; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s21, s14, 0 ; GCN-NEXT: s_cmp_ge_u32 s21, s11 ; GCN-NEXT: s_cselect_b32 s22, -1, 0 @@ -5459,10 +5521,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s22, s23, s22 ; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s14, s14, s11 -; GCN-NEXT: s_sub_u32 s18, s20, s10 +; GCN-NEXT: s_sub_u32 s23, s20, s10 +; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s14, s14, 0 ; GCN-NEXT: s_cmp_lg_u32 s22, 0 -; GCN-NEXT: s_cselect_b32 s18, s18, s20 +; GCN-NEXT: s_cselect_b32 s18, s23, s20 ; GCN-NEXT: s_cselect_b32 s14, s14, s21 ; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s5, s15, s5 @@ -6235,9 +6299,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_readfirstlane_b32 s14, v8 ; TONGA-NEXT: s_sub_u32 s12, s12, s14 ; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, s7 ; TONGA-NEXT: s_sub_u32 s18, s12, s6 ; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s19, s1, 0 ; TONGA-NEXT: s_cmp_ge_u32 s19, s7 ; TONGA-NEXT: s_cselect_b32 s20, -1, 0 @@ -6247,10 +6313,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_cselect_b32 s20, s21, s20 ; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, s7 -; TONGA-NEXT: s_sub_u32 s16, s18, s6 +; TONGA-NEXT: s_sub_u32 s21, s18, s6 +; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, 0 ; TONGA-NEXT: s_cmp_lg_u32 s20, 0 -; TONGA-NEXT: s_cselect_b32 s16, s16, s18 +; TONGA-NEXT: s_cselect_b32 s16, s21, s18 ; TONGA-NEXT: s_cselect_b32 s1, s1, s19 ; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s3, s13, s3 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index ea9bb04..33b0a5d 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -51,9 +51,10 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_addc_u32 s13, 0, s14 ; GCN-NEXT: s_add_u32 s14, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 ; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s12, s12, s13 ; GCN-NEXT: s_mul_i32 s0, s10, s12 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 @@ -84,6 +85,7 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_add_u32 s11, s14, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s1, s12, s10 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 @@ -113,43 +115,46 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: v_readfirstlane_b32 s10, v0 ; GCN-NEXT: s_add_i32 s5, s10, s5 ; GCN-NEXT: s_mul_i32 s10, s9, s4 -; GCN-NEXT: s_add_i32 s12, s5, s10 -; GCN-NEXT: s_sub_i32 s10, s7, s12 +; GCN-NEXT: s_add_i32 s10, s5, s10 +; GCN-NEXT: s_sub_i32 s11, s7, s10 ; GCN-NEXT: s_mul_i32 s4, s8, s4 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s11, s4, s5 -; GCN-NEXT: s_subb_u32 s13, s10, s9 -; GCN-NEXT: s_sub_u32 s14, s6, s8 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s15, s10, s11 -; GCN-NEXT: s_subb_u32 s15, s13, 0 -; GCN-NEXT: s_cmp_ge_u32 s15, s9 -; GCN-NEXT: s_cselect_b32 s16, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s8 -; GCN-NEXT: s_cselect_b32 s17, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s15, s9 -; GCN-NEXT: s_cselect_b32 s16, s17, s16 -; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s13, s13, s9 -; GCN-NEXT: s_sub_u32 s17, s14, s8 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s10, s13, 0 -; GCN-NEXT: s_cmp_lg_u32 s16, 0 -; GCN-NEXT: s_cselect_b32 s11, s17, s14 -; GCN-NEXT: s_cselect_b32 s10, s10, s15 +; GCN-NEXT: s_or_b32 s12, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s12, 0 +; GCN-NEXT: s_subb_u32 s11, s11, s9 +; GCN-NEXT: s_sub_u32 s13, s6, s8 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_subb_u32 s4, s7, s12 -; GCN-NEXT: s_cmp_ge_u32 s4, s9 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_subb_u32 s14, s11, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s9 ; GCN-NEXT: s_cselect_b32 s5, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s13, s8 +; GCN-NEXT: s_cselect_b32 s15, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s14, s9 +; GCN-NEXT: s_cselect_b32 s15, s15, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_subb_u32 s11, s11, s9 +; GCN-NEXT: s_sub_u32 s16, s13, s8 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_subb_u32 s4, s11, 0 +; GCN-NEXT: s_cmp_lg_u32 s15, 0 +; GCN-NEXT: s_cselect_b32 s5, s16, s13 +; GCN-NEXT: s_cselect_b32 s4, s4, s14 +; GCN-NEXT: s_cmp_lg_u32 s12, 0 +; GCN-NEXT: s_subb_u32 s7, s7, s10 +; GCN-NEXT: s_cmp_ge_u32 s7, s9 +; GCN-NEXT: s_cselect_b32 s10, -1, 0 ; GCN-NEXT: s_cmp_ge_u32 s6, s8 -; GCN-NEXT: s_cselect_b32 s7, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s4, s9 -; GCN-NEXT: s_cselect_b32 s5, s7, s5 -; GCN-NEXT: s_cmp_lg_u32 s5, 0 -; GCN-NEXT: s_cselect_b32 s4, s10, s4 -; GCN-NEXT: s_cselect_b32 s5, s11, s6 +; GCN-NEXT: s_cselect_b32 s8, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s7, s9 +; GCN-NEXT: s_cselect_b32 s8, s8, s10 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cselect_b32 s4, s4, s7 +; GCN-NEXT: s_cselect_b32 s5, s5, s6 ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -182,6 +187,7 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_or_b32 s8, s8, s9 +; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -215,6 +221,7 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_or_b32 s18, s18, s19 +; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] @@ -1009,9 +1016,10 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_addc_u32 s13, 0, s14 ; GCN-NEXT: s_add_u32 s14, s8, s9 ; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 ; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: s_addc_u32 s12, s12, s13 ; GCN-NEXT: s_mul_i32 s8, s10, s12 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 @@ -1042,6 +1050,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_add_u32 s11, s14, s8 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: s_addc_u32 s10, s12, s10 ; GCN-NEXT: s_ashr_i32 s8, s7, 31 ; GCN-NEXT: s_add_u32 s6, s6, s8 @@ -1074,43 +1083,46 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_readfirstlane_b32 s12, v0 ; GCN-NEXT: s_add_i32 s11, s12, s11 ; GCN-NEXT: s_mul_i32 s12, s5, s10 -; GCN-NEXT: s_add_i32 s14, s11, s12 -; GCN-NEXT: s_sub_i32 s12, s7, s14 +; GCN-NEXT: s_add_i32 s12, s11, s12 +; GCN-NEXT: s_sub_i32 s13, s7, s12 ; GCN-NEXT: s_mul_i32 s10, s4, s10 ; GCN-NEXT: s_sub_u32 s6, s6, s10 ; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s13, s10, s11 -; GCN-NEXT: s_subb_u32 s15, s12, s5 -; GCN-NEXT: s_sub_u32 s16, s6, s4 -; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_or_b32 s17, s12, s13 -; GCN-NEXT: s_subb_u32 s17, s15, 0 -; GCN-NEXT: s_cmp_ge_u32 s17, s5 -; GCN-NEXT: s_cselect_b32 s18, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s16, s4 -; GCN-NEXT: s_cselect_b32 s19, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s17, s5 -; GCN-NEXT: s_cselect_b32 s18, s19, s18 -; GCN-NEXT: s_or_b32 s12, s12, s13 -; GCN-NEXT: s_subb_u32 s15, s15, s5 -; GCN-NEXT: s_sub_u32 s19, s16, s4 -; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_or_b32 s12, s12, s13 -; GCN-NEXT: s_subb_u32 s12, s15, 0 -; GCN-NEXT: s_cmp_lg_u32 s18, 0 -; GCN-NEXT: s_cselect_b32 s13, s19, s16 -; GCN-NEXT: s_cselect_b32 s12, s12, s17 +; GCN-NEXT: s_or_b32 s14, s10, s11 +; GCN-NEXT: s_cmp_lg_u32 s14, 0 +; GCN-NEXT: s_subb_u32 s13, s13, s5 +; GCN-NEXT: s_sub_u32 s15, s6, s4 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s10, s10, s11 +; GCN-NEXT: s_cmp_lg_u32 s10, 0 +; GCN-NEXT: s_subb_u32 s16, s13, 0 +; GCN-NEXT: s_cmp_ge_u32 s16, s5 +; GCN-NEXT: s_cselect_b32 s11, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s15, s4 +; GCN-NEXT: s_cselect_b32 s17, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s16, s5 +; GCN-NEXT: s_cselect_b32 s17, s17, s11 +; GCN-NEXT: s_cmp_lg_u32 s10, 0 +; GCN-NEXT: s_subb_u32 s13, s13, s5 +; GCN-NEXT: s_sub_u32 s18, s15, s4 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s7, s7, s14 +; GCN-NEXT: s_cmp_lg_u32 s10, 0 +; GCN-NEXT: s_subb_u32 s10, s13, 0 +; GCN-NEXT: s_cmp_lg_u32 s17, 0 +; GCN-NEXT: s_cselect_b32 s11, s18, s15 +; GCN-NEXT: s_cselect_b32 s10, s10, s16 +; GCN-NEXT: s_cmp_lg_u32 s14, 0 +; GCN-NEXT: s_subb_u32 s7, s7, s12 ; GCN-NEXT: s_cmp_ge_u32 s7, s5 -; GCN-NEXT: s_cselect_b32 s10, -1, 0 +; GCN-NEXT: s_cselect_b32 s12, -1, 0 ; GCN-NEXT: s_cmp_ge_u32 s6, s4 ; GCN-NEXT: s_cselect_b32 s4, -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s7, s5 -; GCN-NEXT: s_cselect_b32 s4, s4, s10 +; GCN-NEXT: s_cselect_b32 s4, s4, s12 ; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_cselect_b32 s5, s12, s7 -; GCN-NEXT: s_cselect_b32 s4, s13, s6 +; GCN-NEXT: s_cselect_b32 s5, s10, s7 +; GCN-NEXT: s_cselect_b32 s4, s11, s6 ; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] ; GCN-NEXT: s_sub_u32 s4, s4, s8 ; GCN-NEXT: s_subb_u32 s5, s5, s8 @@ -1158,6 +1170,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_add_u32 s16, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_or_b32 s10, s10, s11 +; GCN-IR-NEXT: s_cmp_lg_u32 s10, 0 ; GCN-IR-NEXT: s_addc_u32 s10, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s14, 63, s14 @@ -1191,6 +1204,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_add_u32 s18, s18, 1 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_or_b32 s20, s20, s21 +; GCN-IR-NEXT: s_cmp_lg_u32 s20, 0 ; GCN-IR-NEXT: s_addc_u32 s19, s19, 0 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3] @@ -1355,9 +1369,10 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_addc_u32 s10, 0, s11 ; GCN-NEXT: s_add_u32 s11, s6, s7 ; GCN-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-NEXT: s_or_b32 s6, s6, s7 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_addc_u32 s9, s9, s10 ; GCN-NEXT: s_mul_i32 s6, s2, s9 ; GCN-NEXT: v_readfirstlane_b32 s7, v0 @@ -1388,6 +1403,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_add_u32 s2, s11, s2 ; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-NEXT: s_or_b32 s6, s6, s7 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_addc_u32 s6, s9, s8 ; GCN-NEXT: v_mul_hi_u32 v1, s2, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s6, 24 @@ -1402,42 +1418,45 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_mul_i32 s7, s5, s6 ; GCN-NEXT: s_mul_i32 s6, s4, s6 ; GCN-NEXT: v_readfirstlane_b32 s8, v0 -; GCN-NEXT: s_add_i32 s10, s8, s7 -; GCN-NEXT: s_sub_i32 s8, 0, s10 -; GCN-NEXT: s_sub_u32 s11, 24, s6 +; GCN-NEXT: s_add_i32 s8, s8, s7 +; GCN-NEXT: s_sub_i32 s9, 0, s8 +; GCN-NEXT: s_sub_u32 s10, 24, s6 +; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN-NEXT: s_or_b32 s11, s6, s7 +; GCN-NEXT: s_cmp_lg_u32 s11, 0 +; GCN-NEXT: s_subb_u32 s9, s9, s5 +; GCN-NEXT: s_sub_u32 s12, s10, s4 ; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-NEXT: s_or_b32 s9, s6, s7 -; GCN-NEXT: s_subb_u32 s12, s8, s5 -; GCN-NEXT: s_sub_u32 s13, s11, s4 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s14, s8, s9 -; GCN-NEXT: s_subb_u32 s14, s12, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s5 -; GCN-NEXT: s_cselect_b32 s15, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s13, s4 -; GCN-NEXT: s_cselect_b32 s16, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s14, s5 -; GCN-NEXT: s_cselect_b32 s15, s16, s15 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s12, s12, s5 -; GCN-NEXT: s_sub_u32 s16, s13, s4 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s8, s12, 0 -; GCN-NEXT: s_cmp_lg_u32 s15, 0 -; GCN-NEXT: s_cselect_b32 s9, s16, s13 -; GCN-NEXT: s_cselect_b32 s8, s8, s14 ; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_subb_u32 s6, 0, s10 -; GCN-NEXT: s_cmp_ge_u32 s6, s5 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_subb_u32 s13, s9, 0 +; GCN-NEXT: s_cmp_ge_u32 s13, s5 ; GCN-NEXT: s_cselect_b32 s7, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s11, s4 +; GCN-NEXT: s_cmp_ge_u32 s12, s4 +; GCN-NEXT: s_cselect_b32 s14, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s13, s5 +; GCN-NEXT: s_cselect_b32 s14, s14, s7 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_subb_u32 s9, s9, s5 +; GCN-NEXT: s_sub_u32 s15, s12, s4 +; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN-NEXT: s_or_b32 s6, s6, s7 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_subb_u32 s6, s9, 0 +; GCN-NEXT: s_cmp_lg_u32 s14, 0 +; GCN-NEXT: s_cselect_b32 s7, s15, s12 +; GCN-NEXT: s_cselect_b32 s6, s6, s13 +; GCN-NEXT: s_cmp_lg_u32 s11, 0 +; GCN-NEXT: s_subb_u32 s8, 0, s8 +; GCN-NEXT: s_cmp_ge_u32 s8, s5 +; GCN-NEXT: s_cselect_b32 s9, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s10, s4 ; GCN-NEXT: s_cselect_b32 s4, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s6, s5 -; GCN-NEXT: s_cselect_b32 s4, s4, s7 +; GCN-NEXT: s_cmp_eq_u32 s8, s5 +; GCN-NEXT: s_cselect_b32 s4, s4, s9 ; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_cselect_b32 s4, s8, s6 -; GCN-NEXT: s_cselect_b32 s5, s9, s11 +; GCN-NEXT: s_cselect_b32 s4, s6, s8 +; GCN-NEXT: s_cselect_b32 s5, s7, s10 ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1470,6 +1489,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s8, s2, 1 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_or_b32 s9, s10, s11 +; GCN-IR-NEXT: s_cmp_lg_u32 s9, 0 ; GCN-IR-NEXT: s_addc_u32 s3, s3, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s2, 63, s2 @@ -1502,6 +1522,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_or_b32 s16, s16, s17 +; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll index bdd22f25..bb5918b2 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddo.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll @@ -18,6 +18,7 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: s_or_b32 s0, s0, s1 +; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_addc_u32 s3, s3, s9 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -34,9 +35,11 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s2, s2, s4 -; VI-NEXT: s_addc_u32 s3, s3, s5 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 +; VI-NEXT: s_addc_u32 s3, s3, s5 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2 @@ -50,12 +53,14 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s4, s2, s6 -; GFX9-NEXT: s_addc_u32 s5, s3, s7 +; GFX9-NEXT: s_add_u32 s6, s2, s6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_addc_u32 s4, s3, s7 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm @@ -68,6 +73,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, s6 +; GFX10-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_addc_u32 s3, s3, s7 ; GFX10-NEXT: s_cselect_b32 s4, -1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 @@ -84,12 +91,14 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s2, s2, s4 +; GFX11-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-NEXT: s_addc_u32 s3, s3, s5 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm @@ -435,6 +444,7 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_add_u32 s4, s4, s6 ; SI-NEXT: s_cselect_b64 s[12:13], -1, 0 ; SI-NEXT: s_or_b32 s6, s12, s13 +; SI-NEXT: s_cmp_lg_u32 s6, 0 ; SI-NEXT: s_addc_u32 s5, s5, s7 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 @@ -455,14 +465,16 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_add_u32 s2, s4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_add_u32 s0, s4, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_addc_u32 s1, s5, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 +; VI-NEXT: s_addc_u32 s0, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v5, s0 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -474,10 +486,12 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s12, s14 -; GFX9-NEXT: s_addc_u32 s1, s13, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_add_u32 s2, s12, s14 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_addc_u32 s0, s13, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -490,8 +504,10 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s12, s14 -; GFX10-NEXT: s_addc_u32 s1, s13, s15 +; GFX10-NEXT: s_cselect_b32 s1, -1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10-NEXT: s_addc_u32 s1, s13, s15 ; GFX10-NEXT: s_cselect_b32 s0, -1, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -504,8 +520,10 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s4, s4, s6 -; GFX11-NEXT: s_addc_u32 s5, s5, s7 +; GFX11-NEXT: s_cselect_b32 s6, -1, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_cmp_lg_u32 s6, 0 +; GFX11-NEXT: s_addc_u32 s5, s5, s7 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index fd461ac..41199b0 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -148,6 +148,7 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_or_b32 s8, s8, s9 +; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -181,6 +182,7 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_or_b32 s16, s16, s17 +; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[2:3], s[4:5] @@ -829,9 +831,10 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_addc_u32 s10, 0, s11 ; GCN-NEXT: s_add_u32 s11, s4, s5 ; GCN-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 ; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_addc_u32 s9, s9, s10 ; GCN-NEXT: s_mul_i32 s4, s6, s9 ; GCN-NEXT: v_readfirstlane_b32 s5, v0 @@ -862,6 +865,7 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_add_u32 s8, s11, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_addc_u32 s4, s9, s6 ; GCN-NEXT: v_mul_hi_u32 v1, s8, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s4, 24 @@ -870,50 +874,52 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_readfirstlane_b32 s8, v1 ; GCN-NEXT: v_readfirstlane_b32 s5, v0 ; GCN-NEXT: s_add_u32 s4, s8, s4 -; GCN-NEXT: s_addc_u32 s10, 0, s5 -; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: s_addc_u32 s8, 0, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_mul_i32 s0, s3, s10 +; GCN-NEXT: s_mul_i32 s0, s3, s8 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s11, s1, s0 -; GCN-NEXT: s_sub_i32 s8, 0, s11 -; GCN-NEXT: s_mul_i32 s0, s2, s10 -; GCN-NEXT: s_sub_u32 s12, 24, s0 +; GCN-NEXT: s_add_i32 s9, s1, s0 +; GCN-NEXT: s_sub_i32 s10, 0, s9 +; GCN-NEXT: s_mul_i32 s0, s2, s8 +; GCN-NEXT: s_sub_u32 s11, 24, s0 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_or_b32 s12, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s12, 0 +; GCN-NEXT: s_subb_u32 s10, s10, s3 +; GCN-NEXT: s_sub_u32 s13, s11, s2 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s9, s0, s1 -; GCN-NEXT: s_subb_u32 s13, s8, s3 -; GCN-NEXT: s_sub_u32 s14, s12, s2 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s8, s13, 0 -; GCN-NEXT: s_cmp_ge_u32 s8, s3 -; GCN-NEXT: s_cselect_b32 s9, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s2 -; GCN-NEXT: s_cselect_b32 s13, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s8, s3 -; GCN-NEXT: s_cselect_b32 s8, s13, s9 -; GCN-NEXT: s_add_u32 s9, s10, 1 -; GCN-NEXT: s_addc_u32 s13, 0, 0 -; GCN-NEXT: s_add_u32 s14, s10, 2 -; GCN-NEXT: s_addc_u32 s15, 0, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_cselect_b32 s8, s14, s9 -; GCN-NEXT: s_cselect_b32 s9, s15, s13 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_subb_u32 s0, 0, s11 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_subb_u32 s0, s10, 0 ; GCN-NEXT: s_cmp_ge_u32 s0, s3 ; GCN-NEXT: s_cselect_b32 s1, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s12, s2 -; GCN-NEXT: s_cselect_b32 s2, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s13, s2 +; GCN-NEXT: s_cselect_b32 s10, -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s0, s3 -; GCN-NEXT: s_cselect_b32 s0, s2, s1 +; GCN-NEXT: s_cselect_b32 s0, s10, s1 +; GCN-NEXT: s_add_u32 s1, s8, 1 +; GCN-NEXT: s_addc_u32 s10, 0, 0 +; GCN-NEXT: s_add_u32 s13, s8, 2 +; GCN-NEXT: s_addc_u32 s14, 0, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_cselect_b32 s0, s9, 0 -; GCN-NEXT: s_cselect_b32 s1, s8, s10 -; GCN-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: s_cselect_b32 s0, s13, s1 +; GCN-NEXT: s_cselect_b32 s1, s14, s10 +; GCN-NEXT: s_cmp_lg_u32 s12, 0 +; GCN-NEXT: s_subb_u32 s9, 0, s9 +; GCN-NEXT: s_cmp_ge_u32 s9, s3 +; GCN-NEXT: s_cselect_b32 s10, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s11, s2 +; GCN-NEXT: s_cselect_b32 s2, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s9, s3 +; GCN-NEXT: s_cselect_b32 s2, s2, s10 +; GCN-NEXT: s_cmp_lg_u32 s2, 0 +; GCN-NEXT: s_cselect_b32 s1, s1, 0 +; GCN-NEXT: s_cselect_b32 s0, s0, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; @@ -939,6 +945,7 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s10, s8, 1 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_or_b32 s6, s6, s7 +; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -971,6 +978,7 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_or_b32 s16, s16, s17 +; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] @@ -1309,6 +1317,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s11, s8, 1 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_or_b32 s6, s6, s7 +; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -1338,6 +1347,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 ; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GCN-IR-NEXT: s_or_b32 s12, s12, s13 +; GCN-IR-NEXT: s_cmp_lg_u32 s12, 0 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 137dc1f..cdcc914 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -51,9 +51,10 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_addc_u32 s13, 0, s14 ; GCN-NEXT: s_add_u32 s14, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 ; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s12, s12, s13 ; GCN-NEXT: s_mul_i32 s0, s10, s12 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 @@ -84,6 +85,7 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_add_u32 s11, s14, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s1, s12, s10 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 @@ -113,43 +115,46 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: v_readfirstlane_b32 s10, v0 ; GCN-NEXT: s_add_i32 s5, s10, s5 ; GCN-NEXT: s_mul_i32 s10, s9, s4 -; GCN-NEXT: s_add_i32 s12, s5, s10 -; GCN-NEXT: s_sub_i32 s10, s7, s12 +; GCN-NEXT: s_add_i32 s10, s5, s10 +; GCN-NEXT: s_sub_i32 s11, s7, s10 ; GCN-NEXT: s_mul_i32 s4, s8, s4 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s11, s4, s5 -; GCN-NEXT: s_subb_u32 s13, s10, s9 -; GCN-NEXT: s_sub_u32 s14, s6, s8 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s15, s10, s11 -; GCN-NEXT: s_subb_u32 s15, s13, 0 -; GCN-NEXT: s_cmp_ge_u32 s15, s9 -; GCN-NEXT: s_cselect_b32 s16, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s8 -; GCN-NEXT: s_cselect_b32 s17, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s15, s9 -; GCN-NEXT: s_cselect_b32 s16, s17, s16 -; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s13, s13, s9 -; GCN-NEXT: s_sub_u32 s17, s14, s8 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s10, s13, 0 -; GCN-NEXT: s_cmp_lg_u32 s16, 0 -; GCN-NEXT: s_cselect_b32 s11, s17, s14 -; GCN-NEXT: s_cselect_b32 s10, s10, s15 +; GCN-NEXT: s_or_b32 s12, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s12, 0 +; GCN-NEXT: s_subb_u32 s11, s11, s9 +; GCN-NEXT: s_sub_u32 s13, s6, s8 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_subb_u32 s4, s7, s12 -; GCN-NEXT: s_cmp_ge_u32 s4, s9 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_subb_u32 s14, s11, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s9 ; GCN-NEXT: s_cselect_b32 s5, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s13, s8 +; GCN-NEXT: s_cselect_b32 s15, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s14, s9 +; GCN-NEXT: s_cselect_b32 s15, s15, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_subb_u32 s11, s11, s9 +; GCN-NEXT: s_sub_u32 s16, s13, s8 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_subb_u32 s4, s11, 0 +; GCN-NEXT: s_cmp_lg_u32 s15, 0 +; GCN-NEXT: s_cselect_b32 s5, s16, s13 +; GCN-NEXT: s_cselect_b32 s4, s4, s14 +; GCN-NEXT: s_cmp_lg_u32 s12, 0 +; GCN-NEXT: s_subb_u32 s7, s7, s10 +; GCN-NEXT: s_cmp_ge_u32 s7, s9 +; GCN-NEXT: s_cselect_b32 s10, -1, 0 ; GCN-NEXT: s_cmp_ge_u32 s6, s8 -; GCN-NEXT: s_cselect_b32 s7, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s4, s9 -; GCN-NEXT: s_cselect_b32 s5, s7, s5 -; GCN-NEXT: s_cmp_lg_u32 s5, 0 -; GCN-NEXT: s_cselect_b32 s4, s10, s4 -; GCN-NEXT: s_cselect_b32 s5, s11, s6 +; GCN-NEXT: s_cselect_b32 s8, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s7, s9 +; GCN-NEXT: s_cselect_b32 s8, s8, s10 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cselect_b32 s4, s4, s7 +; GCN-NEXT: s_cselect_b32 s5, s5, s6 ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -182,6 +187,7 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_or_b32 s8, s8, s9 +; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -215,6 +221,7 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_or_b32 s18, s18, s19 +; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] @@ -846,9 +853,10 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_addc_u32 s10, 0, s11 ; GCN-NEXT: s_add_u32 s11, s4, s5 ; GCN-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 ; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_addc_u32 s9, s9, s10 ; GCN-NEXT: s_mul_i32 s4, s6, s9 ; GCN-NEXT: v_readfirstlane_b32 s5, v0 @@ -879,6 +887,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_add_u32 s8, s11, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_addc_u32 s4, s9, s6 ; GCN-NEXT: v_mul_hi_u32 v1, s8, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s4, 24 @@ -894,43 +903,46 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_mul_i32 s0, s3, s8 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s10, s1, s0 -; GCN-NEXT: s_sub_i32 s9, 0, s10 +; GCN-NEXT: s_add_i32 s9, s1, s0 +; GCN-NEXT: s_sub_i32 s10, 0, s9 ; GCN-NEXT: s_mul_i32 s0, s2, s8 -; GCN-NEXT: s_sub_u32 s11, 24, s0 +; GCN-NEXT: s_sub_u32 s8, 24, s0 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_or_b32 s11, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s11, 0 +; GCN-NEXT: s_subb_u32 s10, s10, s3 +; GCN-NEXT: s_sub_u32 s12, s8, s2 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s8, s0, s1 -; GCN-NEXT: s_subb_u32 s12, s9, s3 -; GCN-NEXT: s_sub_u32 s13, s11, s2 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s14, s8, s9 -; GCN-NEXT: s_subb_u32 s14, s12, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s3 -; GCN-NEXT: s_cselect_b32 s15, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s13, s2 -; GCN-NEXT: s_cselect_b32 s16, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s14, s3 -; GCN-NEXT: s_cselect_b32 s15, s16, s15 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s12, s12, s3 -; GCN-NEXT: s_sub_u32 s16, s13, s2 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s8, s12, 0 -; GCN-NEXT: s_cmp_lg_u32 s15, 0 -; GCN-NEXT: s_cselect_b32 s9, s16, s13 -; GCN-NEXT: s_cselect_b32 s8, s8, s14 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_subb_u32 s0, 0, s10 -; GCN-NEXT: s_cmp_ge_u32 s0, s3 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_subb_u32 s13, s10, 0 +; GCN-NEXT: s_cmp_ge_u32 s13, s3 ; GCN-NEXT: s_cselect_b32 s1, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s11, s2 +; GCN-NEXT: s_cmp_ge_u32 s12, s2 +; GCN-NEXT: s_cselect_b32 s14, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s13, s3 +; GCN-NEXT: s_cselect_b32 s14, s14, s1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_subb_u32 s10, s10, s3 +; GCN-NEXT: s_sub_u32 s15, s12, s2 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_subb_u32 s0, s10, 0 +; GCN-NEXT: s_cmp_lg_u32 s14, 0 +; GCN-NEXT: s_cselect_b32 s1, s15, s12 +; GCN-NEXT: s_cselect_b32 s0, s0, s13 +; GCN-NEXT: s_cmp_lg_u32 s11, 0 +; GCN-NEXT: s_subb_u32 s9, 0, s9 +; GCN-NEXT: s_cmp_ge_u32 s9, s3 +; GCN-NEXT: s_cselect_b32 s10, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s8, s2 ; GCN-NEXT: s_cselect_b32 s2, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s0, s3 -; GCN-NEXT: s_cselect_b32 s1, s2, s1 -; GCN-NEXT: s_cmp_lg_u32 s1, 0 -; GCN-NEXT: s_cselect_b32 s0, s8, s0 -; GCN-NEXT: s_cselect_b32 s1, s9, s11 +; GCN-NEXT: s_cmp_eq_u32 s9, s3 +; GCN-NEXT: s_cselect_b32 s2, s2, s10 +; GCN-NEXT: s_cmp_lg_u32 s2, 0 +; GCN-NEXT: s_cselect_b32 s0, s0, s9 +; GCN-NEXT: s_cselect_b32 s1, s1, s8 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -958,6 +970,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s10, s8, 1 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_or_b32 s6, s6, s7 +; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -990,6 +1003,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_or_b32 s16, s16, s17 +; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] @@ -1079,6 +1093,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s11, s8, 1 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_or_b32 s6, s6, s7 +; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -1108,6 +1123,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s12, s12, 1 ; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0 ; GCN-IR-NEXT: s_or_b32 s14, s14, s15 +; GCN-IR-NEXT: s_cmp_lg_u32 s14, 0 ; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll index e8db647..d67a7b1 100644 --- a/llvm/test/CodeGen/AMDGPU/usubo.ll +++ b/llvm/test/CodeGen/AMDGPU/usubo.ll @@ -18,6 +18,7 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: s_or_b32 s0, s0, s1 +; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_subb_u32 s3, s3, s9 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -34,9 +35,11 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_sub_u32 s2, s2, s4 -; VI-NEXT: s_subb_u32 s3, s3, s5 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 +; VI-NEXT: s_subb_u32 s3, s3, s5 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2 @@ -50,12 +53,14 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s4, s2, s6 -; GFX9-NEXT: s_subb_u32 s5, s3, s7 +; GFX9-NEXT: s_sub_u32 s6, s2, s6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_subb_u32 s4, s3, s7 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm @@ -68,6 +73,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sub_u32 s2, s2, s6 +; GFX10-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_subb_u32 s3, s3, s7 ; GFX10-NEXT: s_cselect_b32 s4, -1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 @@ -84,12 +91,14 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s2, s2, s4 +; GFX11-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-NEXT: s_subb_u32 s3, s3, s5 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm @@ -434,6 +443,7 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_sub_u32 s4, s4, s6 ; SI-NEXT: s_cselect_b64 s[12:13], -1, 0 ; SI-NEXT: s_or_b32 s6, s12, s13 +; SI-NEXT: s_cmp_lg_u32 s6, 0 ; SI-NEXT: s_subb_u32 s5, s5, s7 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 @@ -454,14 +464,16 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_sub_u32 s2, s4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_sub_u32 s0, s4, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_subb_u32 s1, s5, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 +; VI-NEXT: s_subb_u32 s0, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v5, s0 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -473,10 +485,12 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s0, s12, s14 -; GFX9-NEXT: s_subb_u32 s1, s13, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_sub_u32 s2, s12, s14 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_subb_u32 s0, s13, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -489,8 +503,10 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sub_u32 s0, s12, s14 -; GFX10-NEXT: s_subb_u32 s1, s13, s15 +; GFX10-NEXT: s_cselect_b32 s1, -1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10-NEXT: s_subb_u32 s1, s13, s15 ; GFX10-NEXT: s_cselect_b32 s0, -1, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -503,8 +519,10 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s4, s4, s6 -; GFX11-NEXT: s_subb_u32 s5, s5, s7 +; GFX11-NEXT: s_cselect_b32 s6, -1, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_cmp_lg_u32 s6, 0 +; GFX11-NEXT: s_subb_u32 s5, s5, s7 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 28c6b40..75db387 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -774,40 +774,44 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_add_u32 s11, s12, s11 ; GFX1032-NEXT: s_addc_u32 s12, 0, s13 ; GFX1032-NEXT: s_add_u32 s8, s8, s11 +; GFX1032-NEXT: s_cselect_b32 s11, -1, 0 +; GFX1032-NEXT: s_mul_hi_u32 s13, s9, s8 +; GFX1032-NEXT: s_cmp_lg_u32 s11, 0 +; GFX1032-NEXT: s_mul_i32 s11, s9, s8 ; GFX1032-NEXT: s_addc_u32 s5, s5, s12 -; GFX1032-NEXT: s_mul_hi_u32 s11, s9, s8 -; GFX1032-NEXT: s_mul_i32 s12, s9, s8 -; GFX1032-NEXT: s_mul_i32 s9, s9, s5 ; GFX1032-NEXT: s_mul_i32 s10, s10, s8 -; GFX1032-NEXT: s_add_i32 s9, s11, s9 -; GFX1032-NEXT: s_mul_i32 s11, s5, s12 +; GFX1032-NEXT: s_mul_i32 s9, s9, s5 +; GFX1032-NEXT: s_mul_hi_u32 s12, s8, s11 +; GFX1032-NEXT: s_add_i32 s9, s13, s9 +; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s11 ; GFX1032-NEXT: s_add_i32 s9, s9, s10 -; GFX1032-NEXT: s_mul_hi_u32 s10, s8, s12 +; GFX1032-NEXT: s_mul_i32 s10, s5, s11 ; GFX1032-NEXT: s_mul_i32 s15, s8, s9 ; GFX1032-NEXT: s_mul_hi_u32 s14, s8, s9 -; GFX1032-NEXT: s_add_u32 s10, s10, s15 -; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s12 +; GFX1032-NEXT: s_add_u32 s12, s12, s15 ; GFX1032-NEXT: s_addc_u32 s14, 0, s14 -; GFX1032-NEXT: s_mul_hi_u32 s12, s5, s9 -; GFX1032-NEXT: s_add_u32 s10, s10, s11 +; GFX1032-NEXT: s_mul_hi_u32 s11, s5, s9 +; GFX1032-NEXT: s_add_u32 s10, s12, s10 ; GFX1032-NEXT: s_mul_i32 s9, s5, s9 ; GFX1032-NEXT: s_addc_u32 s10, s14, s13 -; GFX1032-NEXT: s_addc_u32 s11, s12, 0 +; GFX1032-NEXT: s_addc_u32 s11, s11, 0 ; GFX1032-NEXT: s_add_u32 s9, s10, s9 ; GFX1032-NEXT: s_addc_u32 s10, 0, s11 ; GFX1032-NEXT: s_add_u32 s8, s8, s9 +; GFX1032-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s8 +; GFX1032-NEXT: s_cmp_lg_u32 s9, 0 +; GFX1032-NEXT: s_mul_hi_u32 s9, s3, s8 ; GFX1032-NEXT: s_addc_u32 s5, s5, s10 -; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s8 -; GFX1032-NEXT: s_mul_i32 s12, s2, s5 -; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s5 -; GFX1032-NEXT: s_mul_hi_u32 s10, s3, s8 ; GFX1032-NEXT: s_mul_i32 s8, s3, s8 -; GFX1032-NEXT: s_add_u32 s9, s9, s12 -; GFX1032-NEXT: s_addc_u32 s11, 0, s11 +; GFX1032-NEXT: s_mul_i32 s12, s2, s5 +; GFX1032-NEXT: s_mul_hi_u32 s10, s2, s5 +; GFX1032-NEXT: s_add_u32 s11, s11, s12 +; GFX1032-NEXT: s_addc_u32 s10, 0, s10 ; GFX1032-NEXT: s_mul_hi_u32 s13, s3, s5 -; GFX1032-NEXT: s_add_u32 s8, s9, s8 +; GFX1032-NEXT: s_add_u32 s8, s11, s8 ; GFX1032-NEXT: s_mul_i32 s5, s3, s5 -; GFX1032-NEXT: s_addc_u32 s8, s11, s10 +; GFX1032-NEXT: s_addc_u32 s8, s10, s9 ; GFX1032-NEXT: s_addc_u32 s9, s13, 0 ; GFX1032-NEXT: s_add_u32 s5, s8, s5 ; GFX1032-NEXT: s_addc_u32 s8, 0, s9 @@ -820,8 +824,11 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_sub_i32 s11, s3, s9 ; GFX1032-NEXT: s_sub_u32 s10, s2, s10 ; GFX1032-NEXT: s_cselect_b32 s12, -1, 0 +; GFX1032-NEXT: s_cmp_lg_u32 s12, 0 ; GFX1032-NEXT: s_subb_u32 s11, s11, s1 ; GFX1032-NEXT: s_sub_u32 s13, s10, s0 +; GFX1032-NEXT: s_cselect_b32 s14, -1, 0 +; GFX1032-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1032-NEXT: s_subb_u32 s11, s11, 0 ; GFX1032-NEXT: s_cmp_ge_u32 s11, s1 ; GFX1032-NEXT: s_cselect_b32 s14, -1, 0 @@ -894,8 +901,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX1064-NEXT: s_sub_u32 s8, 0, s0 -; GFX1064-NEXT: s_subb_u32 s9, 0, s1 +; GFX1064-NEXT: s_sub_u32 s9, 0, s0 +; GFX1064-NEXT: s_subb_u32 s10, 0, s1 ; GFX1064-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX1064-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1064-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -904,102 +911,109 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX1064-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1064-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 -; GFX1064-NEXT: v_readfirstlane_b32 s5, v0 -; GFX1064-NEXT: s_mul_i32 s10, s8, s4 -; GFX1064-NEXT: s_mul_hi_u32 s12, s8, s5 -; GFX1064-NEXT: s_mul_i32 s11, s9, s5 -; GFX1064-NEXT: s_add_i32 s10, s12, s10 -; GFX1064-NEXT: s_mul_i32 s13, s8, s5 -; GFX1064-NEXT: s_add_i32 s10, s10, s11 -; GFX1064-NEXT: s_mul_hi_u32 s12, s5, s13 -; GFX1064-NEXT: s_mul_i32 s15, s5, s10 -; GFX1064-NEXT: s_mul_hi_u32 s14, s4, s13 -; GFX1064-NEXT: s_mul_i32 s11, s4, s13 -; GFX1064-NEXT: s_mul_hi_u32 s13, s5, s10 +; GFX1064-NEXT: v_readfirstlane_b32 s8, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 +; GFX1064-NEXT: s_mul_i32 s5, s9, s8 +; GFX1064-NEXT: s_mul_hi_u32 s12, s9, s4 +; GFX1064-NEXT: s_mul_i32 s11, s10, s4 +; GFX1064-NEXT: s_add_i32 s5, s12, s5 +; GFX1064-NEXT: s_mul_i32 s13, s9, s4 +; GFX1064-NEXT: s_add_i32 s5, s5, s11 +; GFX1064-NEXT: s_mul_hi_u32 s12, s4, s13 +; GFX1064-NEXT: s_mul_i32 s15, s4, s5 +; GFX1064-NEXT: s_mul_hi_u32 s14, s8, s13 +; GFX1064-NEXT: s_mul_i32 s11, s8, s13 +; GFX1064-NEXT: s_mul_hi_u32 s13, s4, s5 ; GFX1064-NEXT: s_add_u32 s12, s12, s15 ; GFX1064-NEXT: s_addc_u32 s13, 0, s13 -; GFX1064-NEXT: s_mul_hi_u32 s16, s4, s10 +; GFX1064-NEXT: s_mul_hi_u32 s16, s8, s5 ; GFX1064-NEXT: s_add_u32 s11, s12, s11 -; GFX1064-NEXT: s_mul_i32 s10, s4, s10 +; GFX1064-NEXT: s_mul_i32 s5, s8, s5 ; GFX1064-NEXT: s_addc_u32 s11, s13, s14 ; GFX1064-NEXT: s_addc_u32 s12, s16, 0 -; GFX1064-NEXT: s_add_u32 s10, s11, s10 +; GFX1064-NEXT: s_add_u32 s5, s11, s5 ; GFX1064-NEXT: s_addc_u32 s11, 0, s12 -; GFX1064-NEXT: s_add_u32 s5, s5, s10 -; GFX1064-NEXT: s_addc_u32 s4, s4, s11 -; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s5 -; GFX1064-NEXT: s_mul_i32 s11, s8, s5 -; GFX1064-NEXT: s_mul_i32 s8, s8, s4 -; GFX1064-NEXT: s_mul_i32 s9, s9, s5 -; GFX1064-NEXT: s_add_i32 s8, s10, s8 -; GFX1064-NEXT: s_mul_i32 s10, s4, s11 -; GFX1064-NEXT: s_add_i32 s8, s8, s9 -; GFX1064-NEXT: s_mul_hi_u32 s9, s5, s11 -; GFX1064-NEXT: s_mul_i32 s14, s5, s8 -; GFX1064-NEXT: s_mul_hi_u32 s13, s5, s8 -; GFX1064-NEXT: s_add_u32 s9, s9, s14 -; GFX1064-NEXT: s_mul_hi_u32 s12, s4, s11 +; GFX1064-NEXT: s_add_u32 s12, s4, s5 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_mul_hi_u32 s13, s9, s12 +; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064-NEXT: s_mul_i32 s4, s9, s12 +; GFX1064-NEXT: s_addc_u32 s8, s8, s11 +; GFX1064-NEXT: s_mul_i32 s10, s10, s12 +; GFX1064-NEXT: s_mul_i32 s9, s9, s8 +; GFX1064-NEXT: s_mul_hi_u32 s5, s12, s4 +; GFX1064-NEXT: s_add_i32 s9, s13, s9 +; GFX1064-NEXT: s_mul_hi_u32 s11, s8, s4 +; GFX1064-NEXT: s_add_i32 s9, s9, s10 +; GFX1064-NEXT: s_mul_i32 s4, s8, s4 +; GFX1064-NEXT: s_mul_i32 s14, s12, s9 +; GFX1064-NEXT: s_mul_hi_u32 s13, s12, s9 +; GFX1064-NEXT: s_add_u32 s5, s5, s14 ; GFX1064-NEXT: s_addc_u32 s13, 0, s13 -; GFX1064-NEXT: s_mul_hi_u32 s11, s4, s8 -; GFX1064-NEXT: s_add_u32 s9, s9, s10 -; GFX1064-NEXT: s_mul_i32 s8, s4, s8 -; GFX1064-NEXT: s_addc_u32 s9, s13, s12 -; GFX1064-NEXT: s_addc_u32 s10, s11, 0 -; GFX1064-NEXT: s_add_u32 s8, s9, s8 -; GFX1064-NEXT: s_addc_u32 s9, 0, s10 -; GFX1064-NEXT: s_add_u32 s5, s5, s8 -; GFX1064-NEXT: s_addc_u32 s4, s4, s9 -; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s5 -; GFX1064-NEXT: s_mul_i32 s11, s2, s4 -; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s4 -; GFX1064-NEXT: s_mul_hi_u32 s9, s3, s5 +; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s9 +; GFX1064-NEXT: s_add_u32 s4, s5, s4 +; GFX1064-NEXT: s_mul_i32 s9, s8, s9 +; GFX1064-NEXT: s_addc_u32 s4, s13, s11 +; GFX1064-NEXT: s_addc_u32 s5, s10, 0 +; GFX1064-NEXT: s_add_u32 s4, s4, s9 +; GFX1064-NEXT: s_addc_u32 s9, 0, s5 +; GFX1064-NEXT: s_add_u32 s10, s12, s4 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_mul_hi_u32 s11, s2, s10 +; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064-NEXT: s_mul_hi_u32 s4, s3, s10 +; GFX1064-NEXT: s_addc_u32 s5, s8, s9 +; GFX1064-NEXT: s_mul_i32 s8, s3, s10 +; GFX1064-NEXT: s_mul_i32 s10, s2, s5 +; GFX1064-NEXT: s_mul_hi_u32 s9, s2, s5 +; GFX1064-NEXT: s_add_u32 s10, s11, s10 +; GFX1064-NEXT: s_addc_u32 s9, 0, s9 +; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s5 +; GFX1064-NEXT: s_add_u32 s8, s10, s8 ; GFX1064-NEXT: s_mul_i32 s5, s3, s5 -; GFX1064-NEXT: s_add_u32 s8, s8, s11 -; GFX1064-NEXT: s_addc_u32 s10, 0, s10 -; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s4 -; GFX1064-NEXT: s_add_u32 s5, s8, s5 -; GFX1064-NEXT: s_mul_i32 s4, s3, s4 -; GFX1064-NEXT: s_addc_u32 s5, s10, s9 +; GFX1064-NEXT: s_addc_u32 s4, s9, s4 ; GFX1064-NEXT: s_addc_u32 s8, s12, 0 -; GFX1064-NEXT: s_add_u32 s10, s5, s4 +; GFX1064-NEXT: s_add_u32 s10, s4, s5 ; GFX1064-NEXT: s_addc_u32 s11, 0, s8 ; GFX1064-NEXT: s_mul_hi_u32 s4, s0, s10 ; GFX1064-NEXT: s_mul_i32 s5, s0, s11 ; GFX1064-NEXT: s_mul_i32 s8, s1, s10 ; GFX1064-NEXT: s_add_i32 s4, s4, s5 -; GFX1064-NEXT: s_add_i32 s8, s4, s8 +; GFX1064-NEXT: s_add_i32 s12, s4, s8 ; GFX1064-NEXT: s_mul_i32 s4, s0, s10 -; GFX1064-NEXT: s_sub_i32 s9, s3, s8 -; GFX1064-NEXT: s_sub_u32 s12, s2, s4 +; GFX1064-NEXT: s_sub_i32 s8, s3, s12 +; GFX1064-NEXT: s_sub_u32 s13, s2, s4 ; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX1064-NEXT: s_subb_u32 s9, s9, s1 -; GFX1064-NEXT: s_sub_u32 s13, s12, s0 -; GFX1064-NEXT: s_subb_u32 s9, s9, 0 -; GFX1064-NEXT: s_cmp_ge_u32 s9, s1 +; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064-NEXT: s_subb_u32 s14, s8, s1 +; GFX1064-NEXT: s_sub_u32 s15, s13, s0 +; GFX1064-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX1064-NEXT: s_subb_u32 s8, s14, 0 +; GFX1064-NEXT: s_cmp_ge_u32 s8, s1 +; GFX1064-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1064-NEXT: s_cmp_ge_u32 s15, s0 ; GFX1064-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1064-NEXT: s_cmp_ge_u32 s13, s0 -; GFX1064-NEXT: s_cselect_b32 s13, -1, 0 -; GFX1064-NEXT: s_cmp_eq_u32 s9, s1 -; GFX1064-NEXT: s_cselect_b32 s9, s13, s14 -; GFX1064-NEXT: s_add_u32 s13, s10, 1 +; GFX1064-NEXT: s_cmp_eq_u32 s8, s1 +; GFX1064-NEXT: s_cselect_b32 s8, s14, s9 +; GFX1064-NEXT: s_add_u32 s9, s10, 1 ; GFX1064-NEXT: s_addc_u32 s14, s11, 0 ; GFX1064-NEXT: s_add_u32 s15, s10, 2 ; GFX1064-NEXT: s_addc_u32 s16, s11, 0 -; GFX1064-NEXT: s_cmp_lg_u32 s9, 0 -; GFX1064-NEXT: s_cselect_b32 s13, s15, s13 +; GFX1064-NEXT: s_cmp_lg_u32 s8, 0 +; GFX1064-NEXT: s_cselect_b32 s15, s15, s9 ; GFX1064-NEXT: s_cselect_b32 s14, s16, s14 ; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX1064-NEXT: s_subb_u32 s3, s3, s8 +; GFX1064-NEXT: s_subb_u32 s3, s3, s12 ; GFX1064-NEXT: s_cmp_ge_u32 s3, s1 ; GFX1064-NEXT: s_cselect_b32 s4, -1, 0 -; GFX1064-NEXT: s_cmp_ge_u32 s12, s0 +; GFX1064-NEXT: s_cmp_ge_u32 s13, s0 ; GFX1064-NEXT: s_cselect_b32 s5, -1, 0 ; GFX1064-NEXT: s_cmp_eq_u32 s3, s1 ; GFX1064-NEXT: s_cselect_b32 s1, s5, s4 ; GFX1064-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1064-NEXT: s_cselect_b32 s5, s14, s11 -; GFX1064-NEXT: s_cselect_b32 s4, s13, s10 +; GFX1064-NEXT: s_cselect_b32 s4, s15, s10 ; GFX1064-NEXT: s_cbranch_execnz .LBB15_3 ; GFX1064-NEXT: .LBB15_2: ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll index 4445383..64d055b 100644 --- a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll +++ b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll @@ -271,6 +271,7 @@ define i1 @workgroup_nonzero() { ; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX8-NEXT: s_or_b32 s4, s12, s13 ; DAGISEL-GFX8-NEXT: s_or_b32 s4, s4, s14 +; DAGISEL-GFX8-NEXT: s_cmp_lg_u32 s4, 0 ; DAGISEL-GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 ; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31] @@ -280,6 +281,7 @@ define i1 @workgroup_nonzero() { ; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX942-NEXT: s_or_b32 s0, s12, s13 ; DAGISEL-GFX942-NEXT: s_or_b32 s0, s0, s14 +; DAGISEL-GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; DAGISEL-GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 ; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; DAGISEL-GFX942-NEXT: s_setpc_b64 s[30:31] @@ -297,6 +299,8 @@ define i1 @workgroup_nonzero() { ; DAGISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0 ; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe ; DAGISEL-GFX12-NEXT: s_or_b32 s0, s0, s1 +; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe +; DAGISEL-GFX12-NEXT: s_cmp_lg_u32 s0, 0 ; DAGISEL-GFX12-NEXT: s_cselect_b32 s0, -1, 0 ; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe ; DAGISEL-GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 @@ -307,6 +311,7 @@ define i1 @workgroup_nonzero() { ; GISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX8-NEXT: s_or_b32 s4, s12, s13 ; GISEL-GFX8-NEXT: s_or_b32 s4, s4, s14 +; GISEL-GFX8-NEXT: s_cmp_lg_u32 s4, 0 ; GISEL-GFX8-NEXT: s_cselect_b32 s4, 1, 0 ; GISEL-GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GISEL-GFX8-NEXT: s_setpc_b64 s[30:31] @@ -316,6 +321,7 @@ define i1 @workgroup_nonzero() { ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX942-NEXT: s_or_b32 s0, s12, s13 ; GISEL-GFX942-NEXT: s_or_b32 s0, s0, s14 +; GISEL-GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GISEL-GFX942-NEXT: s_cselect_b32 s0, 1, 0 ; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-GFX942-NEXT: s_setpc_b64 s[30:31] @@ -333,6 +339,8 @@ define i1 @workgroup_nonzero() { ; GISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0 ; GISEL-GFX12-NEXT: s_wait_alu 0xfffe ; GISEL-GFX12-NEXT: s_or_b32 s0, s0, s1 +; GISEL-GFX12-NEXT: s_wait_alu 0xfffe +; GISEL-GFX12-NEXT: s_cmp_lg_u32 s0, 0 ; GISEL-GFX12-NEXT: s_cselect_b32 s0, 1, 0 ; GISEL-GFX12-NEXT: s_wait_alu 0xfffe ; GISEL-GFX12-NEXT: v_mov_b32_e32 v0, s0 |