about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Matt Arsenault <Matthew.Arsenault@amd.com> 2024-06-13 13:13:04 +0200
committer GitHub <noreply@github.com> 2024-06-13 13:13:04 +0200
commit 5e8cf0bf1182fc29469495e1e1377ad117b27172 (patch)
tree 540e3c9630d9d93bfd0b265fc42bd765bdf2e25d
parent 7ead2d8c7e9114b3f23666209a1654939987cb30 (diff)
download llvm-5e8cf0bf1182fc29469495e1e1377ad117b27172.zip
llvm-5e8cf0bf1182fc29469495e1e1377ad117b27172.tar.gz
llvm-5e8cf0bf1182fc29469495e1e1377ad117b27172.tar.bz2
AMDGPU: Fix buffer intrinsic store of bfloat (#95377)
-rw-r--r-- llvm/lib/Target/AMDGPU/SIISelLowering.cpp 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll 37
2 files changed, 34 insertions, 7 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4946129..8109820 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -874,7 +874,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
{MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
- MVT::f16, MVT::i16, MVT::i8, MVT::i128},
+ MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
Custom);
setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
@@ -9973,7 +9973,7 @@ SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
EVT VDataType, SDLoc DL,
SDValue Ops[],
MemSDNode *M) const {
- if (VDataType == MVT::f16)
+ if (VDataType == MVT::f16 || VDataType == MVT::bf16)
Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
index f7f3742..82dd35a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
@@ -5,11 +5,38 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX11 %s
-; FIXME
-; define amdgpu_ps void @buffer_store_bf16(ptr addrspace(8) inreg %rsrc, bfloat %data, i32 %offset) {
-; call void @llvm.amdgcn.raw.ptr.buffer.store.bf16(bfloat %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
-; ret void
-; }
+define amdgpu_ps void @buffer_store_bf16(ptr addrspace(8) inreg %rsrc, bfloat %data, i32 %offset) {
+; GFX7-LABEL: buffer_store_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: buffer_store_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: buffer_store_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: buffer_store_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: buffer_store_bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: buffer_store_b16 v0, v1, s[0:3], 0 offen
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ call void @llvm.amdgcn.raw.ptr.buffer.store.bf16(bfloat %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
+ ret void
+}
define amdgpu_ps void @buffer_store_v2bf16(ptr addrspace(8) inreg %rsrc, <2 x bfloat> %data, i32 %offset) {
; GFX7-LABEL: buffer_store_v2bf16: