diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll | 436 |
1 file changed, 436 insertions, 0 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll new file mode 100644 index 0000000..64392a1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll @@ -0,0 +1,436 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,SDAG %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GISEL %s + +define amdgpu_ps float @flat_load_b32_idxprom(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_load_b32_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr %p, i64 %idxprom + %ret = load float, ptr %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @flat_load_b32_idx32(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_load_b32_idx32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %arrayidx = getelementptr inbounds float, ptr %p, i32 %idx + %ret = load float, ptr %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @flat_load_b32_idxprom_wrong_stride(ptr align 4 inreg %p, i32 %idx) { +; SDAG-LABEL: flat_load_b32_idxprom_wrong_stride: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 3, s[0:1] +; SDAG-NEXT: flat_load_b32 v0, v[0:1] +; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-NEXT: ; return to shader part epilog +; +; GISEL-LABEL: flat_load_b32_idxprom_wrong_stride: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], 
s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 3, v[0:1] +; GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo +; GISEL-NEXT: flat_load_b32 v0, v[0:1] +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr %p, i64 %idxprom + %ret = load float, ptr %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @flat_load_b16_idxprom_ioffset(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_load_b16_idxprom_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_u16 v0, v0, s[0:1] offset:32 scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i16, ptr %p, i64 %idxadd + %ld = load i16, ptr %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps <2 x float> @flat_load_b64_idxprom(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_load_b64_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b64 v[0:1], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr %p, i64 %idxprom + %ret = load <2 x float>, ptr %arrayidx, align 4 + ret <2 x float> %ret +} + +define amdgpu_ps <3 x float> @flat_load_b96_idxprom(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_load_b96_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b96 v[0:2], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + 
%arrayidx = getelementptr inbounds [3 x float], ptr %p, i64 %idxprom + %ret = load <3 x float>, ptr %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <3 x float> @flat_load_b96_idxpromi_ioffset(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_load_b96_idxpromi_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds [3 x float], ptr %p, i64 %idxadd + %ret = load <3 x float>, ptr %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <4 x float> @flat_load_b128_idxprom(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_load_b128_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b128 v[0:3], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds <4 x float>, ptr %p, i64 %idxprom + %ret = load <4 x float>, ptr %arrayidx, align 4 + ret <4 x float> %ret +} + +define amdgpu_ps float @flat_load_b32_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) { +; GCN-LABEL: flat_load_b32_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v[0:1] +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr %p, i64 %idxprom + %ret = load float, ptr %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @flat_load_b32_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) { +; GCN-LABEL: flat_load_b32_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v[0:1] +; 
GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] offset:64 scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds float, ptr %p, i64 %idxadd + %ret = load float, ptr %arrayidx, align 4 + ret float %ret +} + +; Note: this is a byte load, so the element size is 1 and there is nothing to scale; no scale_offset is expected in the output. + +define amdgpu_ps float @flat_load_b8_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) { +; GCN-LABEL: flat_load_b8_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v[0:1] +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: flat_load_u8 v0, v0, s[0:1] offset:16 +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i8, ptr %p, i64 %idxadd + %ld = load i8, ptr %arrayidx + %ret.i32 = zext i8 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps float @flat_load_b16_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) { +; GCN-LABEL: flat_load_b16_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v[0:1] +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: flat_load_u16 v0, v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds i16, ptr %p, i64 %idxprom + %ld = load i16, ptr %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps float @flat_load_b16_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) { +; GCN-LABEL: 
flat_load_b16_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v[0:1] +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: flat_load_u16 v0, v0, s[0:1] offset:32 scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i16, ptr %p, i64 %idxadd + %ld = load i16, ptr %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps <2 x float> @flat_load_b64_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) { +; GCN-LABEL: flat_load_b64_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v[0:1] +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: flat_load_b64 v[0:1], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr %p, i64 %idxprom + %ret = load <2 x float>, ptr %arrayidx, align 4 + ret <2 x float> %ret +} + +define amdgpu_ps <3 x float> @flat_load_b96_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) { +; GCN-LABEL: flat_load_b96_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v[0:1] +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: flat_load_b96 v[0:2], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds [3 x float], ptr %p, i64 %idxprom + %ret = load <3 x float>, ptr %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <3 x float> @flat_load_b96_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) { +; GCN-LABEL: 
flat_load_b96_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v[0:1] +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: flat_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds [3 x float], ptr %p, i64 %idxadd + %ret = load <3 x float>, ptr %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <4 x float> @flat_load_b128_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) { +; GCN-LABEL: flat_load_b128_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v[0:1] +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: flat_load_b128 v[0:3], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds <4 x float>, ptr %p, i64 %idxprom + %ret = load <4 x float>, ptr %arrayidx, align 4 + ret <4 x float> %ret +} + +define amdgpu_ps void @flat_store_b32_idxprom(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_store_b32_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v1, 1.0 +; GCN-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset +; GCN-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr %p, i64 %idxprom + store float 1.0, ptr %arrayidx, align 4 + ret void +} + +define amdgpu_ps void @flat_store_b16_idxprom(ptr align 2 inreg %p, i32 %idx) { +; GCN-LABEL: flat_store_b16_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: flat_store_b16 v0, v1, s[0:1] scale_offset +; GCN-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds i16, ptr %p, i64 %idxprom + store i16 1, ptr 
%arrayidx, align 2 + ret void +} + +define amdgpu_ps void @flat_store_b64_idxprom(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_store_b64_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b64_e32 v[2:3], 1.0 +; GCN-NEXT: flat_store_b64 v0, v[2:3], s[0:1] scale_offset +; GCN-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds double, ptr %p, i64 %idxprom + store double 1.0, ptr %arrayidx, align 4 + ret void +} + +define amdgpu_ps void @flat_atomicrmw_b32_idxprom(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_atomicrmw_b32_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: flat_atomic_add_u32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS +; GCN-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds i32, ptr %p, i64 %idxprom + atomicrmw add ptr %arrayidx, i32 1 monotonic + ret void +} + +define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg %p, i32 %idx) { +; SDAG-LABEL: flat_atomicrmw_b64_rtn_idxprom: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 3, s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; SDAG-NEXT: s_mov_b32 s0, exec_lo +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: s_wait_alu 0xfffe +; SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; SDAG-NEXT: s_cbranch_execnz .LBB21_3 +; SDAG-NEXT: ; %bb.1: ; %Flow +; SDAG-NEXT: s_wait_alu 0xfffe +; SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; SDAG-NEXT: s_cbranch_execnz .LBB21_4 +; SDAG-NEXT: .LBB21_2: ; %atomicrmw.phi +; SDAG-NEXT: s_wait_alu 0xfffe +; SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-NEXT: s_branch .LBB21_5 +; SDAG-NEXT: .LBB21_3: ; %atomicrmw.global +; SDAG-NEXT: v_mov_b64_e32 v[0:1], 1 +; SDAG-NEXT: 
flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; SDAG-NEXT: s_wait_xcnt 0x0 +; SDAG-NEXT: s_wait_alu 0xfffe +; SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; SDAG-NEXT: s_cbranch_execz .LBB21_2 +; SDAG-NEXT: .LBB21_4: ; %atomicrmw.private +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc_lo +; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; SDAG-NEXT: s_wait_loadcnt 0x0 +; SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1 +; SDAG-NEXT: scratch_store_b64 v4, v[2:3], off +; SDAG-NEXT: s_wait_xcnt 0x0 +; SDAG-NEXT: s_wait_alu 0xfffe +; SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; SDAG-NEXT: s_branch .LBB21_5 +; SDAG-NEXT: .LBB21_5: +; +; GISEL-LABEL: flat_atomicrmw_b64_rtn_idxprom: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: v_mov_b32_e32 v2, v0 +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GISEL-NEXT: s_mov_b64 s[2:3], src_private_base +; GISEL-NEXT: s_mov_b32 s2, exec_lo +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 3, v[2:3] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v0 +; GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v1, vcc_lo +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_cmpx_ne_u32_e64 s3, v5 +; GISEL-NEXT: s_xor_b32 s2, exec_lo, s2 +; GISEL-NEXT: s_cbranch_execnz .LBB21_3 +; GISEL-NEXT: ; %bb.1: ; %Flow +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_and_not1_saveexec_b32 s0, s2 +; GISEL-NEXT: s_cbranch_execnz .LBB21_4 +; GISEL-NEXT: .LBB21_2: ; %atomicrmw.phi +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_branch .LBB21_5 +; GISEL-NEXT: .LBB21_3: ; 
%atomicrmw.global +; GISEL-NEXT: v_mov_b64_e32 v[0:1], 1 +; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GISEL-NEXT: flat_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] scale_offset th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GISEL-NEXT: s_wait_xcnt 0x0 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_and_not1_saveexec_b32 s0, s2 +; GISEL-NEXT: s_cbranch_execz .LBB21_2 +; GISEL-NEXT: .LBB21_4: ; %atomicrmw.private +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GISEL-NEXT: s_wait_alu 0xfffd +; GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: scratch_load_b64 v[0:1], v4, off +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1 +; GISEL-NEXT: scratch_store_b64 v4, v[2:3], off +; GISEL-NEXT: s_wait_xcnt 0x0 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GISEL-NEXT: s_branch .LBB21_5 +; GISEL-NEXT: .LBB21_5: +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds i64, ptr %p, i64 %idxprom + %ret = atomicrmw add ptr %arrayidx, i64 1 monotonic + %ret.cast = bitcast i64 %ret to <2 x float> + ret <2 x float> %ret.cast +} + +!0 = !{i32 0, i32 1024} |