From 5e8cf0bf1182fc29469495e1e1377ad117b27172 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 13 Jun 2024 13:13:04 +0200 Subject: AMDGPU: Fix buffer intrinsic store of bfloat (#95377) --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 4 +-- .../llvm.amdgcn.raw.ptr.buffer.store.bf16.ll | 37 +++++++++++++++++++--- 2 files changed, 34 insertions(+), 7 deletions(-) (limited to 'llvm') diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 4946129..8109820 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -874,7 +874,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, - MVT::f16, MVT::i16, MVT::i8, MVT::i128}, + MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128}, Custom); setOperationAction(ISD::STACKSAVE, MVT::Other, Custom); @@ -9973,7 +9973,7 @@ SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG, EVT VDataType, SDLoc DL, SDValue Ops[], MemSDNode *M) const { - if (VDataType == MVT::f16) + if (VDataType == MVT::f16 || VDataType == MVT::bf16) Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]); SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]); diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll index f7f3742..82dd35a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll @@ -5,11 +5,38 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX11 %s -; FIXME -; define amdgpu_ps void @buffer_store_bf16(ptr addrspace(8) inreg %rsrc, bfloat %data, i32 %offset) { -; call void @llvm.amdgcn.raw.ptr.buffer.store.bf16(bfloat %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) -; ret void -; } +define amdgpu_ps void @buffer_store_bf16(ptr addrspace(8) inreg %rsrc, bfloat %data, i32 %offset) { +; GFX7-LABEL: buffer_store_bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: buffer_store_bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: buffer_store_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: buffer_store_bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: buffer_store_b16 v0, v1, s[0:3], 0 offen +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + call void @llvm.amdgcn.raw.ptr.buffer.store.bf16(bfloat %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) + ret void +} define amdgpu_ps void @buffer_store_v2bf16(ptr addrspace(8) inreg %rsrc, <2 x bfloat> %data, i32 %offset) { ; GFX7-LABEL: buffer_store_v2bf16: -- cgit v1.1