diff options
author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2024-06-27 21:17:16 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-06-27 21:17:16 +0200 |
commit | a2a73d892a3fd16a1dd2218cf142c7e99cc69ba6 (patch) | |
tree | f19d0c9aaeaea6eb3e22278837722dddac50fc21 | |
parent | 918313d17d38d8723d5d81fef008538677abf9cc (diff) | |
download | llvm-a2a73d892a3fd16a1dd2218cf142c7e99cc69ba6.zip llvm-a2a73d892a3fd16a1dd2218cf142c7e99cc69ba6.tar.gz llvm-a2a73d892a3fd16a1dd2218cf142c7e99cc69ba6.tar.bz2 |
AMDGPU: Fix no return atomicrmw fadd v2f16 selection for gfx908 (#96948)
We previously would always expand this with a cmpxchg loop, while
it should be the same conditions as the f32 case (except for the
denormal concern).
5 files changed, 23 insertions, 126 deletions
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 98054dd..4bf8f20 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1626,6 +1626,7 @@ defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amd } let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in { +defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>; defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>; defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 16fa726..12977af0 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -16219,13 +16219,20 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { if (Subtarget->hasGFX90AInsts() && Ty->isDoubleTy()) return ReportUnsafeHWInst(AtomicExpansionKind::None); - if (AS != AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) { - // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+. - if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts()) - return ReportUnsafeHWInst(AtomicExpansionKind::None); - // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+. - if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts()) - return ReportUnsafeHWInst(AtomicExpansionKind::None); + if (AS != AMDGPUAS::FLAT_ADDRESS) { + if (Ty->isFloatTy()) { + // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+. + if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts()) + return ReportUnsafeHWInst(AtomicExpansionKind::None); + // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+. + if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts()) + return ReportUnsafeHWInst(AtomicExpansionKind::None); + } else { + // gfx908 + if (RMW->use_empty() && + Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() && isHalf2(Ty)) + return ReportUnsafeHWInst(AtomicExpansionKind::None); + } } // flat atomic fadd f32: gfx940, gfx11+. diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index 00a01e8..d50ba64 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -6430,26 +6430,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s8 -; GFX908-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s10, s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s10 -; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -7912,26 +7895,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s8 -; GFX908-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s10, s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s10 -; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB25_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 6276200..e312b37 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -15550,22 +15550,9 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB67_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB67_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -15771,22 +15758,9 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB68_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB68_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15995,22 +15969,9 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB69_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB69_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16917,22 +16878,9 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB73_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB73_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: @@ -17368,22 +17316,9 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB75_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB75_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll index 533f0f2..021a55f 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll @@ -5750,19 +5750,7 @@ define void @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(ptr addrspace( ; GFX9-NEXT: ret void ; ; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe( -; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 -; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX908: atomicrmw.start: -; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] -; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX908: atomicrmw.end: +; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 ; GFX908-NEXT: ret void ; ; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe( |