aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Matt Arsenault <Matthew.Arsenault@amd.com> 2024-06-27 21:17:16 +0200
committer: GitHub <noreply@github.com> 2024-06-27 21:17:16 +0200
commit: a2a73d892a3fd16a1dd2218cf142c7e99cc69ba6 (patch)
tree: f19d0c9aaeaea6eb3e22278837722dddac50fc21
parent: 918313d17d38d8723d5d81fef008538677abf9cc (diff)
download: llvm-a2a73d892a3fd16a1dd2218cf142c7e99cc69ba6.zip
llvm-a2a73d892a3fd16a1dd2218cf142c7e99cc69ba6.tar.gz
llvm-a2a73d892a3fd16a1dd2218cf142c7e99cc69ba6.tar.bz2
AMDGPU: Fix no return atomicrmw fadd v2f16 selection for gfx908 (#96948)
Previously, this was always expanded into a cmpxchg loop, but it should be selected under the same conditions as the f32 case (except for the denormal concern).
-rw-r--r-- llvm/lib/Target/AMDGPU/FLATInstructions.td 1
-rw-r--r-- llvm/lib/Target/AMDGPU/SIISelLowering.cpp 21
-rw-r--r-- llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll 38
-rw-r--r-- llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll 75
-rw-r--r-- llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll 14
5 files changed, 23 insertions, 126 deletions
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 98054dd..4bf8f20 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1626,6 +1626,7 @@ defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amd
}
let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in {
+defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>;
defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>;
defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 16fa726..12977af0 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16219,13 +16219,20 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
if (Subtarget->hasGFX90AInsts() && Ty->isDoubleTy())
return ReportUnsafeHWInst(AtomicExpansionKind::None);
- if (AS != AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
- // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
- if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
- return ReportUnsafeHWInst(AtomicExpansionKind::None);
- // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
- if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
- return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ if (AS != AMDGPUAS::FLAT_ADDRESS) {
+ if (Ty->isFloatTy()) {
+ // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
+ if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
+ if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ } else {
+ // gfx908
+ if (RMW->use_empty() &&
+ Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() && isHalf2(Ty))
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ }
}
// flat atomic fadd f32: gfx940, gfx11+.
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index 00a01e8..d50ba64 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -6430,26 +6430,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s8
-; GFX908-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s10, s8, 0x400
-; GFX908-NEXT: s_mov_b64 s[8:9], 0
-; GFX908-NEXT: v_mov_b32_e32 v3, s10
-; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v1, v2, v0
-; GFX908-NEXT: v_mov_b32_e32 v5, v2
-; GFX908-NEXT: v_mov_b32_e32 v4, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
+; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
-; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX908-NEXT: v_mov_b32_e32 v2, v4
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX908-NEXT: s_cbranch_execnz .LBB20_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
@@ -7912,26 +7895,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s8
-; GFX908-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s10, s8, 0x400
-; GFX908-NEXT: s_mov_b64 s[8:9], 0
-; GFX908-NEXT: v_mov_b32_e32 v3, s10
-; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v1, v2, v0
-; GFX908-NEXT: v_mov_b32_e32 v5, v2
-; GFX908-NEXT: v_mov_b32_e32 v4, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
+; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
-; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX908-NEXT: v_mov_b32_e32 v2, v4
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX908-NEXT: s_cbranch_execnz .LBB25_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index 6276200..e312b37 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -15550,22 +15550,9 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory
; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB67_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB67_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
@@ -15771,22 +15758,9 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine
; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB68_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB68_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -15995,22 +15969,9 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine
; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB69_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB69_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -16917,22 +16878,9 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a
; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB73_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB73_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
@@ -17368,22 +17316,9 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory
; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB75_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB75_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
index 533f0f2..021a55f 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
@@ -5750,19 +5750,7 @@ define void @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(ptr addrspace(
; GFX9-NEXT: ret void
;
; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(
-; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4
-; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GFX908: atomicrmw.start:
-; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
-; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX908: atomicrmw.end:
+; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
; GFX908-NEXT: ret void
;
; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(