author     Krzysztof Drewniak <Krzysztof.Drewniak@amd.com>  2025-03-25 19:25:16 +0000
committer  jerryyin <zhuoryin@amd.com>                      2025-03-26 20:42:29 +0000
commit     8934f7db6bbb97fb5e0d911990070f628dc813dc
tree       9050ad91b7867d07a219e3b5778964dc446b7aca
parent     857a04cd7670b629b560ba7e67c758a0c15e0841
[LowerBufferFatPointers] Don't lose data from unaligned sub-word vectors
Previously, AMDGPULowerBufferFatPointers would always cast long
vectors of small types, such as <8 x i8> or <2 x f16>, to types that
can be operated on as words, loading or storing <2 x i32> and i32,
respectively, for those examples.
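For illustration, here is roughly what that wide path produced for an
align-1 load of <8 x i8> (a sketch; the value names are made up, though
the intrinsic call shape matches the load_v2i32_align1 test added below):

    ; the <8 x i8> was reinterpreted as <2 x i32> and fetched with one wide, word-based load
    %wide = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 1 %buf, i32 0, i32 0, i32 0)
    %ret = bitcast <2 x i32> %wide to <8 x i8>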
This transformation is correct - and returns 0s for the out-of-bounds
part of a partially in-bounds load, as expected - only if the offset of
the vector is word-aligned. That is, supposing a buffer has a numRecords
of 8, loading an <8 x i8> starting at offset 4 will correctly return
the last word of the buffer followed by four 0s. However, if the load
instead starts at offset 6 or 7 (an alignment of 2 or 1, respectively),
the hardware will mask off the entire word, causing an all-0 result.
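Spelled out for that example (restating the prose above; numRecords = 8
bytes, the <8 x i8> fetched as two dwords):

    ; offset 4: word 0 = bytes 4-7   (in bounds)          -> data
    ;           word 1 = bytes 8-11  (out of bounds)       -> 0s, as expected
    ; offset 6: word 0 = bytes 6-9   (straddles the limit) -> masked to all 0s
    ;           word 1 = bytes 10-13 (out of bounds)       -> 0s
    ; so bytes 6 and 7, though in bounds, silently read back as 0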
To inhibit this surprising and undesired behavior, loads/stores of
vectors with sub-word elements that aren't aligned to at least a word
will be broken down into scalar reads and writes, preserving the
expected out-of-bounds behavior.
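The load_v8i8_align1 test added below shows the resulting IR-level
lowering; abridged here with plain value names (the loads at offsets 2
through 6 follow the same pattern):

    %ret.off.0 = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 %buf, i32 0, i32 0, i32 0)
    %ret.slice.0 = insertelement <8 x i8> poison, i8 %ret.off.0, i64 0
    %ret.off.1 = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 %buf, i32 1, i32 0, i32 0)
    %ret.slice.1 = insertelement <8 x i8> %ret.slice.0, i8 %ret.off.1, i64 1
    ; ... one i8 load and insertelement per element at offsets 2 through 6 ...
    %ret.off.7 = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 %buf, i32 7, i32 0, i32 0)
    %ret = insertelement <8 x i8> %ret.slice.6, i8 %ret.off.7, i64 7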
This transformation will still load at least one element at a time, so
a <4 x half>, align 1 load will still use buffer_load_ushort.
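The element-at-a-time floor is visible in the load_v2f16_align2 test
added below: each half element is still fetched as a whole 16-bit unit
(a buffer_load_ushort at the ISA level), just one element per operation.
Abridged, with plain value names:

    %ret.off.0 = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 %buf, i32 0, i32 0, i32 0)
    %ret.slice.0 = insertelement <2 x i16> poison, i16 %ret.off.0, i64 0
    %ret.off.2 = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 %buf, i32 2, i32 0, i32 0)
    %ret.slice.1 = insertelement <2 x i16> %ret.slice.0, i16 %ret.off.2, i64 1
    %ret = bitcast <2 x i16> %ret.slice.1 to <2 x half>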
3 files changed, 347 insertions, 20 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index 5dd1fe1..ee23a14 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -697,8 +697,9 @@ class LegalizeBufferContentTypesVisitor /// Convert a vector or scalar type that can't be operated on by buffer /// intrinsics to one that would be legal through bitcasts and/or truncation. - /// Uses the wider of i32, i16, or i8 where possible. - Type *legalNonAggregateFor(Type *T); + /// Uses the wider of i32, i16, or i8 where possible, accounting for the + /// alignment of the load or store. + Type *legalNonAggregateFor(Type *T, Align A); Value *makeLegalNonAggregate(Value *V, Type *TargetType, const Twine &Name); Value *makeIllegalNonAggregate(Value *V, Type *OrigType, const Twine &Name); @@ -712,8 +713,10 @@ class LegalizeBufferContentTypesVisitor /// Return the [index, length] pairs into which `T` needs to be cut to form /// legal buffer load or store operations. Clears `Slices`. Creates an empty /// `Slices` for non-vector inputs and creates one slice if no slicing will be - /// needed. - void getVecSlices(Type *T, SmallVectorImpl<VecSlice> &Slices); + /// needed. If `T` is a vector of sub-word type (i8, half, etc.) and `align` + /// is less than 4, splits the load into scalar loads so that reading off the + /// end of a byte buffer doesn't lose data. + void getVecSlices(Type *T, Align A, SmallVectorImpl<VecSlice> &Slices); Value *extractSlice(Value *Vec, VecSlice S, const Twine &Name); Value *insertSlice(Value *Whole, Value *Part, VecSlice S, const Twine &Name); @@ -790,7 +793,8 @@ Value *LegalizeBufferContentTypesVisitor::vectorToArray(Value *V, return ArrayRes; } -Type *LegalizeBufferContentTypesVisitor::legalNonAggregateFor(Type *T) { +Type *LegalizeBufferContentTypesVisitor::legalNonAggregateFor(Type *T, + Align A) { TypeSize Size = DL.getTypeStoreSizeInBits(T); // Implicitly zero-extend to the next byte if needed if (!DL.typeSizeEqualsStoreSize(T)) @@ -802,15 +806,18 @@ Type *LegalizeBufferContentTypesVisitor::legalNonAggregateFor(Type *T) { return T; } unsigned ElemSize = DL.getTypeSizeInBits(ElemTy).getFixedValue(); - if (isPowerOf2_32(ElemSize) && ElemSize >= 16 && ElemSize <= 128) { + bool IsUnaligned16BitVector = ElemSize == 16 && Size > ElemSize && A < 4; + if (isPowerOf2_32(ElemSize) && ElemSize >= 16 && ElemSize <= 128 && + !IsUnaligned16BitVector) { // [vectors of] anything that's 16/32/64/128 bits can be cast and split into - // legal buffer operations. + // legal buffer operations, except that unaligned 16-bit vectors need to be + // split. return T; } Type *BestVectorElemType = nullptr; - if (Size.isKnownMultipleOf(32)) + if (Size.isKnownMultipleOf(32) && A >= Align(4)) BestVectorElemType = IRB.getInt32Ty(); - else if (Size.isKnownMultipleOf(16)) + else if (Size.isKnownMultipleOf(16) && A >= Align(2)) BestVectorElemType = IRB.getInt16Ty(); else BestVectorElemType = IRB.getInt8Ty(); @@ -883,7 +890,7 @@ Type *LegalizeBufferContentTypesVisitor::intrinsicTypeFor(Type *LegalType) { } void LegalizeBufferContentTypesVisitor::getVecSlices( - Type *T, SmallVectorImpl<VecSlice> &Slices) { + Type *T, Align A, SmallVectorImpl<VecSlice> &Slices) { Slices.clear(); auto *VT = dyn_cast<FixedVectorType>(T); if (!VT) @@ -902,6 +909,16 @@ void LegalizeBufferContentTypesVisitor::getVecSlices( // example, <3 x i64>, since that's not slicing. 
uint64_t ElemsPer3Words = ElemsPerWord * 3; + if (ElemBitWidth < 32 && A < Align(4)) { + // Don't use wide loads when loading unaligned vectors of 16- or 8-bit + // types, as that can cause something like a load of <4 x half> + // from %base + 6 with numRecords = 8 bytes to not load the last element + // as one might expect. + ElemsPer4Words = ElemsPer3Words = ElemsPer2Words = ElemsPerWord = 0; + if (ElemBitWidth < 16 && A < Align(2)) { + ElemsPerShort = 0; + } + } uint64_t TotalElems = VT->getNumElements(); uint64_t Index = 0; auto TrySlice = [&](unsigned MaybeLen) { @@ -1003,11 +1020,12 @@ bool LegalizeBufferContentTypesVisitor::visitLoadImpl( // Typical case + Align PartAlign = commonAlignment(OrigLI.getAlign(), AggByteOff); Type *ArrayAsVecType = scalarArrayTypeAsVector(PartType); - Type *LegalType = legalNonAggregateFor(ArrayAsVecType); + Type *LegalType = legalNonAggregateFor(ArrayAsVecType, PartAlign); SmallVector<VecSlice> Slices; - getVecSlices(LegalType, Slices); + getVecSlices(LegalType, PartAlign, Slices); bool HasSlices = Slices.size() > 1; bool IsAggPart = !AggIdxs.empty(); Value *LoadsRes; @@ -1133,13 +1151,14 @@ std::pair<bool, bool> LegalizeBufferContentTypesVisitor::visitStoreImpl( NewData = arrayToVector(NewData, ArrayAsVecType, Name); } - Type *LegalType = legalNonAggregateFor(ArrayAsVecType); + Align PartAlign = commonAlignment(OrigSI.getAlign(), AggByteOff); + Type *LegalType = legalNonAggregateFor(ArrayAsVecType, PartAlign); if (LegalType != ArrayAsVecType) { NewData = makeLegalNonAggregate(NewData, LegalType, Name); } SmallVector<VecSlice> Slices; - getVecSlices(LegalType, Slices); + getVecSlices(LegalType, PartAlign, Slices); bool NeedToSplit = Slices.size() > 1 || IsAggPart; if (!NeedToSplit) { Type *StorableType = intrinsicTypeFor(LegalType); diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll index 405058b..37eb82a 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll @@ -274,6 +274,44 @@ define void @store_v2i32(<2 x i32> %data, ptr addrspace(8) inreg %buf) { ret void } +define <2 x i32> @load_v2i32_align1(ptr addrspace(8) inreg %buf) { +; SDAG-LABEL: load_v2i32_align1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: load_v2i32_align1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_setpc_b64 s[30:31] + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <2 x i32>, ptr addrspace(7) %p, align 1 + ret <2 x i32> %ret +} + +define void @store_v2i32_align1(<2 x i32> %data, ptr addrspace(8) inreg %buf) { +; SDAG-LABEL: store_v2i32_align1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: store_v2i32_align1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_setpc_b64 s[30:31] + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + 
store <2 x i32> %data, ptr addrspace(7) %p, align 1 + ret void +} + define <3 x i32> @load_v3i32(ptr addrspace(8) inreg %buf) { ; SDAG-LABEL: load_v3i32: ; SDAG: ; %bb.0: @@ -616,6 +654,56 @@ define void @store_v2f16(<2 x half> %data, ptr addrspace(8) inreg %buf) { ret void } +define <2 x half> @load_v2f16_align2(ptr addrspace(8) inreg %buf) { +; SDAG-LABEL: load_v2f16_align2: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0 +; SDAG-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:2 +; SDAG-NEXT: s_waitcnt vmcnt(1) +; SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: load_v2f16_align2: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0 +; GISEL-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:2 +; GISEL-NEXT: s_waitcnt vmcnt(1) +; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <2 x half>, ptr addrspace(7) %p, align 2 + ret <2 x half> %ret +} + +define void @store_v2f16_align2(<2 x half> %data, ptr addrspace(8) inreg %buf) { +; SDAG-LABEL: store_v2f16_align2: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 +; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:2 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: store_v2f16_align2: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0 +; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:2 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_setpc_b64 s[30:31] + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <2 x half> %data, ptr addrspace(7) %p, align 2 + ret void +} + define <4 x bfloat> @load_v4bf16(ptr addrspace(8) inreg %buf) { ; SDAG-LABEL: load_v4bf16: ; SDAG: ; %bb.0: @@ -2391,6 +2479,72 @@ define void @store_v8i8(<8 x i8> %data, ptr addrspace(8) inreg %buf) { ret void } +define <8 x i8> @load_v8i8_align1(ptr addrspace(8) inreg %buf) { +; SDAG-LABEL: load_v8i8_align1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 +; SDAG-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1 +; SDAG-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2 +; SDAG-NEXT: buffer_load_ubyte v3, off, s[16:19], 0 offset:3 +; SDAG-NEXT: buffer_load_ubyte v4, off, s[16:19], 0 offset:4 +; SDAG-NEXT: buffer_load_ubyte v5, off, s[16:19], 0 offset:5 +; SDAG-NEXT: buffer_load_ubyte v6, off, s[16:19], 0 offset:6 +; SDAG-NEXT: buffer_load_ubyte v7, off, s[16:19], 0 offset:7 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: load_v8i8_align1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 +; GISEL-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1 +; GISEL-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2 +; GISEL-NEXT: buffer_load_ubyte v3, off, s[16:19], 0 offset:3 +; GISEL-NEXT: buffer_load_ubyte v4, 
off, s[16:19], 0 offset:4 +; GISEL-NEXT: buffer_load_ubyte v5, off, s[16:19], 0 offset:5 +; GISEL-NEXT: buffer_load_ubyte v6, off, s[16:19], 0 offset:6 +; GISEL-NEXT: buffer_load_ubyte v7, off, s[16:19], 0 offset:7 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_setpc_b64 s[30:31] + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <8 x i8>, ptr addrspace(7) %p, align 1 + ret <8 x i8> %ret +} + +define void @store_v8i8_align1(<8 x i8> %data, ptr addrspace(8) inreg %buf) { +; SDAG-LABEL: store_v8i8_align1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: buffer_store_byte v0, off, s[16:19], 0 +; SDAG-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1 +; SDAG-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2 +; SDAG-NEXT: buffer_store_byte v3, off, s[16:19], 0 offset:3 +; SDAG-NEXT: buffer_store_byte v4, off, s[16:19], 0 offset:4 +; SDAG-NEXT: buffer_store_byte v5, off, s[16:19], 0 offset:5 +; SDAG-NEXT: buffer_store_byte v6, off, s[16:19], 0 offset:6 +; SDAG-NEXT: buffer_store_byte v7, off, s[16:19], 0 offset:7 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: store_v8i8_align1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: buffer_store_byte v0, off, s[16:19], 0 +; GISEL-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1 +; GISEL-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2 +; GISEL-NEXT: buffer_store_byte v3, off, s[16:19], 0 offset:3 +; GISEL-NEXT: buffer_store_byte v4, off, s[16:19], 0 offset:4 +; GISEL-NEXT: buffer_store_byte v5, off, s[16:19], 0 offset:5 +; GISEL-NEXT: buffer_store_byte v6, off, s[16:19], 0 offset:6 +; GISEL-NEXT: buffer_store_byte v7, off, s[16:19], 0 offset:7 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_setpc_b64 s[30:31] + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <8 x i8> %data, ptr addrspace(7) %p, align 1 + ret void +} + define <12 x i8> @load_v12i8(ptr addrspace(8) inreg %buf) { ; SDAG-LABEL: load_v12i8: ; SDAG: ; %bb.0: @@ -2912,12 +3066,33 @@ define [2 x half] @load_a2f16(ptr addrspace(8) inreg %buf) { ; SDAG-LABEL: load_a2f16: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0 +; SDAG-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:2 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: load_a2f16: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0 +; GISEL-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:2 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_setpc_b64 s[30:31] + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load [2 x half], ptr addrspace(7) %p + ret [2 x half] %ret +} + +define [2 x half] @load_a2f16_align4(ptr addrspace(8) inreg %buf) { +; SDAG-LABEL: load_a2f16_align4: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: load_a2f16: +; GISEL-LABEL: load_a2f16_align4: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0 @@ -2925,7 +3100,7 @@ define [2 x half] @load_a2f16(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GISEL-NEXT: s_setpc_b64 
s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) - %ret = load [2 x half], ptr addrspace(7) %p + %ret = load [2 x half], ptr addrspace(7) %p, align 4 ret [2 x half] %ret } @@ -2933,13 +3108,34 @@ define void @store_a2f16([2 x half] %data, ptr addrspace(8) inreg %buf) { ; SDAG-LABEL: store_a2f16: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 +; SDAG-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:2 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: store_a2f16: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0 +; GISEL-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:2 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_setpc_b64 s[30:31] + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store [2 x half] %data, ptr addrspace(7) %p + ret void +} + +define void @store_a2f16_align4([2 x half] %data, ptr addrspace(8) inreg %buf) { +; SDAG-LABEL: store_a2f16_align4: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: s_mov_b32 s4, 0x5040100 ; SDAG-NEXT: v_perm_b32 v0, v1, v0, s4 ; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: store_a2f16: +; GISEL-LABEL: store_a2f16_align4: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -2948,7 +3144,7 @@ define void @store_a2f16([2 x half] %data, ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) - store [2 x half] %data, ptr addrspace(7) %p + store [2 x half] %data, ptr addrspace(7) %p, align 4 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll index d18f0f8..53c0a74 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll @@ -165,6 +165,28 @@ define void @store_v2i32(<2 x i32> %data, ptr addrspace(8) inreg %buf) { ret void } +define <2 x i32> @load_v2i32_align1(ptr addrspace(8) inreg %buf) { +; CHECK-LABEL: define <2 x i32> @load_v2i32_align1( +; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <2 x i32> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <2 x i32>, ptr addrspace(7) %p, align 1 + ret <2 x i32> %ret +} + +define void @store_v2i32_align1(<2 x i32> %data, ptr addrspace(8) inreg %buf) { +; CHECK-LABEL: define void @store_v2i32_align1( +; CHECK-SAME: <2 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <2 x i32> %data, ptr addrspace(7) %p, align 1 + ret void +} + define <3 x i32> @load_v3i32(ptr addrspace(8) inreg %buf) { ; CHECK-LABEL: define <3 x i32> @load_v3i32( ; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] { 
@@ -363,6 +385,36 @@ define void @store_v2f16(<2 x half> %data, ptr addrspace(8) inreg %buf) { ret void } +define <2 x half> @load_v2f16_align2(ptr addrspace(8) inreg %buf) { +; CHECK-LABEL: define <2 x half> @load_v2f16_align2( +; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <2 x i16> poison, i16 [[RET_OFF_0]], i64 0 +; CHECK-NEXT: [[RET_OFF_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0) +; CHECK-NEXT: [[RET_SLICE_1:%.*]] = insertelement <2 x i16> [[RET_SLICE_0]], i16 [[RET_OFF_2]], i64 1 +; CHECK-NEXT: [[RET:%.*]] = bitcast <2 x i16> [[RET_SLICE_1]] to <2 x half> +; CHECK-NEXT: ret <2 x half> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <2 x half>, ptr addrspace(7) %p, align 2 + ret <2 x half> %ret +} + +define void @store_v2f16_align2(<2 x half> %data, ptr addrspace(8) inreg %buf) { +; CHECK-LABEL: define void @store_v2f16_align2( +; CHECK-SAME: <2 x half> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <2 x half> [[DATA]] to <2 x i16> +; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <2 x i16> [[DATA_LEGAL]], i64 0 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <2 x i16> [[DATA_LEGAL]], i64 1 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <2 x half> %data, ptr addrspace(7) %p, align 2 + ret void +} + define <4 x bfloat> @load_v4bf16(ptr addrspace(8) inreg %buf) { ; CHECK-LABEL: define <4 x bfloat> @load_v4bf16( ; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] { @@ -1400,6 +1452,58 @@ define void @store_v8i8(<8 x i8> %data, ptr addrspace(8) inreg %buf) { ret void } +define <8 x i8> @load_v8i8_align1(ptr addrspace(8) inreg %buf) { +; CHECK-LABEL: define <8 x i8> @load_v8i8_align1( +; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <8 x i8> poison, i8 [[RET_OFF_0]], i64 0 +; CHECK-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0) +; CHECK-NEXT: [[RET_SLICE_1:%.*]] = insertelement <8 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1 +; CHECK-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 2, i32 0, i32 0) +; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <8 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2 +; CHECK-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0) +; CHECK-NEXT: [[RET_SLICE_3:%.*]] = insertelement <8 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3 +; CHECK-NEXT: [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 4, i32 0, i32 0) +; CHECK-NEXT: [[RET_SLICE_4:%.*]] = insertelement <8 x i8> [[RET_SLICE_3]], i8 [[RET_OFF_4]], 
i64 4 +; CHECK-NEXT: [[RET_OFF_5:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 5, i32 0, i32 0) +; CHECK-NEXT: [[RET_SLICE_5:%.*]] = insertelement <8 x i8> [[RET_SLICE_4]], i8 [[RET_OFF_5]], i64 5 +; CHECK-NEXT: [[RET_OFF_6:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 6, i32 0, i32 0) +; CHECK-NEXT: [[RET_SLICE_6:%.*]] = insertelement <8 x i8> [[RET_SLICE_5]], i8 [[RET_OFF_6]], i64 6 +; CHECK-NEXT: [[RET_OFF_7:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 7, i32 0, i32 0) +; CHECK-NEXT: [[RET:%.*]] = insertelement <8 x i8> [[RET_SLICE_6]], i8 [[RET_OFF_7]], i64 7 +; CHECK-NEXT: ret <8 x i8> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <8 x i8>, ptr addrspace(7) %p, align 1 + ret <8 x i8> %ret +} + +define void @store_v8i8_align1(<8 x i8> %data, ptr addrspace(8) inreg %buf) { +; CHECK-LABEL: define void @store_v8i8_align1( +; CHECK-SAME: <8 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <8 x i8> [[DATA]], i64 0 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <8 x i8> [[DATA]], i64 1 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0) +; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <8 x i8> [[DATA]], i64 2 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 1 [[BUF]], i32 2, i32 0, i32 0) +; CHECK-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <8 x i8> [[DATA]], i64 3 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0) +; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <8 x i8> [[DATA]], i64 4 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 1 [[BUF]], i32 4, i32 0, i32 0) +; CHECK-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <8 x i8> [[DATA]], i64 5 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_5]], ptr addrspace(8) align 1 [[BUF]], i32 5, i32 0, i32 0) +; CHECK-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <8 x i8> [[DATA]], i64 6 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_6]], ptr addrspace(8) align 1 [[BUF]], i32 6, i32 0, i32 0) +; CHECK-NEXT: [[DATA_SLICE_7:%.*]] = extractelement <8 x i8> [[DATA]], i64 7 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_7]], ptr addrspace(8) align 1 [[BUF]], i32 7, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <8 x i8> %data, ptr addrspace(7) %p, align 1 + ret void +} + define <12 x i8> @load_v12i8(ptr addrspace(8) inreg %buf) { ; CHECK-LABEL: define <12 x i8> @load_v12i8( ; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] { @@ -1543,7 +1647,11 @@ define void @store_a2i32([2 x i32] %data, ptr addrspace(8) inreg %buf) { define [2 x half] @load_a2f16(ptr addrspace(8) inreg %buf) { ; CHECK-LABEL: define [2 x half] @load_a2f16( ; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.load.v2f16(ptr addrspace(8) align 2 [[BUF]], i32 0, 
i32 0, i32 0) +; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <2 x i16> poison, i16 [[RET_OFF_0]], i64 0 +; CHECK-NEXT: [[RET_OFF_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0) +; CHECK-NEXT: [[RET_SLICE_1:%.*]] = insertelement <2 x i16> [[RET_SLICE_0]], i16 [[RET_OFF_2]], i64 1 +; CHECK-NEXT: [[RET_LOADABLE:%.*]] = bitcast <2 x i16> [[RET_SLICE_1]] to <2 x half> ; CHECK-NEXT: [[RET_ELEM_0:%.*]] = extractelement <2 x half> [[RET_LOADABLE]], i64 0 ; CHECK-NEXT: [[RET_AS_ARRAY_0:%.*]] = insertvalue [2 x half] poison, half [[RET_ELEM_0]], 0 ; CHECK-NEXT: [[RET_ELEM_1:%.*]] = extractelement <2 x half> [[RET_LOADABLE]], i64 1 @@ -1562,7 +1670,11 @@ define void @store_a2f16([2 x half] %data, ptr addrspace(8) inreg %buf) { ; CHECK-NEXT: [[DATA_AS_VEC_0:%.*]] = insertelement <2 x half> poison, half [[DATA_ELEM_0]], i64 0 ; CHECK-NEXT: [[DATA_ELEM_1:%.*]] = extractvalue [2 x half] [[DATA]], 1 ; CHECK-NEXT: [[DATA_AS_VEC_1:%.*]] = insertelement <2 x half> [[DATA_AS_VEC_0]], half [[DATA_ELEM_1]], i64 1 -; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2f16(<2 x half> [[DATA_AS_VEC_1]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <2 x half> [[DATA_AS_VEC_1]] to <2 x i16> +; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <2 x i16> [[DATA_LEGAL]], i64 0 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <2 x i16> [[DATA_LEGAL]], i64 1 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0) ; CHECK-NEXT: ret void ; %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) |