[AMDGPU] Avoid crashes for non-byte-sized types in PromoteAlloca (#134042)

This patch addresses three problems when promoting allocas to vectors:
- Element types with size < 1 byte in allocas with a vector type caused divisions by zero.
- Element types whose size doesn't match their AllocSize hit an assertion.
- Access types whose size doesn't match their AllocSize hit an assertion.

With this patch, we do not attempt to promote affected allocas to vectors. In principle, we could handle these cases in PromoteAlloca, e.g., by truncating and extending elements from/to their allocation size. It's however unclear if we ever encounter such cases in practice, so that doesn't seem worth the added complexity.

For SWDEV-511252
author: Fabian Ritter <fabian.ritter@amd.com> 2025-04-14 09:13:54 +0200
committer: GitHub <noreply@github.com> 2025-04-14 09:13:54 +0200
commit: cf188d650ce26b4ee3e11101d844361fca15ba64 (patch)
tree: 2cd709b0ff88a7eac2e1d49bbc6adcace8f29dcd
parent: 150e7b14f9474bc4d6891faaae4de6b8c5f6c797 (diff)
download: llvm-cf188d650ce26b4ee3e11101d844361fca15ba64.zip
llvm-cf188d650ce26b4ee3e11101d844361fca15ba64.tar.gz
llvm-cf188d650ce26b4ee3e11101d844361fca15ba64.tar.bz2
2 files changed, 228 insertions, 10 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 94ecb6ba..6c01f6d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -729,6 +729,11 @@ static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
   // complicated.
   if (isa<FixedVectorType>(AccessTy)) {
     TypeSize AccTS = DL.getTypeStoreSize(AccessTy);
+    // If the type size and the store size don't match, we would need to do more
+    // than just bitcast to translate between extracted/insertable subvectors
+    // and the accessed value.
+    if (AccTS * 8 != DL.getTypeSizeInBits(AccessTy))
+      return false;
     TypeSize VecTS = DL.getTypeStoreSize(VecTy->getElementType());
     return AccTS.isKnownMultipleOf(VecTS);
   }
@@ -813,15 +818,17 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
 
     if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
       unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
-      unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
-      // Expand vector if required to match padding of inner type,
-      // i.e. odd size subvectors.
-      // Storage size of new vector must match that of alloca for correct
-      // behaviour of byte offsets and GEP computation.
-      if (NumElems * ElementSize != AllocaSize)
-        NumElems = AllocaSize / ElementSize;
-      if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
-        VectorTy = FixedVectorType::get(ElemTy, NumElems);
+      if (ElementSize > 0) {
+        unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
+        // Expand vector if required to match padding of inner type,
+        // i.e. odd size subvectors.
+        // Storage size of new vector must match that of alloca for correct
+        // behaviour of byte offsets and GEP computation.
+        if (NumElems * ElementSize != AllocaSize)
+          NumElems = AllocaSize / ElementSize;
+        if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
+          VectorTy = FixedVectorType::get(ElemTy, NumElems);
+      }
     }
   }
 
@@ -861,7 +868,14 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   LLVM_DEBUG(dbgs() << "  Attempting promotion to: " << *VectorTy << "\n");
 
   Type *VecEltTy = VectorTy->getElementType();
-  unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
+  unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
+  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
+    LLVM_DEBUG(dbgs() << "  Cannot convert to vector if the allocation size "
+                         "does not match the type's size\n");
+    return false;
+  }
+  unsigned ElementSize = ElementSizeInBits / 8;
+  assert(ElementSize > 0);
   for (auto *U : Uses) {
     Instruction *Inst = cast<Instruction>(U->getUser());
 
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-non-byte-sizes.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-non-byte-sizes.ll
new file mode 100644
index 0000000..4095347
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-non-byte-sizes.ll
@@ -0,0 +1,204 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
+
+; Check that types where the store/allocation sizes don't match the type size
+; don't crash.
+
+
+define <7 x i9> @load_elem_i9_access_7xi9() {
+; CHECK-LABEL: @load_elem_i9_access_7xi9(
+; CHECK-NEXT:    [[P:%.*]] = alloca <16 x i9>, align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    [[L:%.*]] = load <7 x i9>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <7 x i9> [[L]]
+;
+  %p = alloca <16 x i9>, align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  %l = load <7 x i9>, ptr addrspace(5) %g, align 1
+  ret <7 x i9> %l
+}
+
+define <8 x i1> @load_elem_i1_access_8xi1() {
+; CHECK-LABEL: @load_elem_i1_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <16 x i1>, align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    [[L:%.*]] = load <8 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <8 x i1> [[L]]
+;
+  %p = alloca <16 x i1>, align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}
+
+define <3 x i1> @load_elem_i1_access_3xi1() {
+; CHECK-LABEL: @load_elem_i1_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <16 x i1>, align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca <16 x i1>, align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+define <3 x i1> @load_elem_i8_access_3xi1() {
+; CHECK-LABEL: @load_elem_i8_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <8 x i8>, align 1, addrspace(5)
+; CHECK-NEXT:    store <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, ptr addrspace(5) [[P]], align 1
+; CHECK-NEXT:    [[G:%.*]] = getelementptr <4 x i8>, ptr addrspace(5) [[P]], i64 1
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca <8 x i8>, align 1, addrspace(5)
+  store <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+; This one is actually not problematic.
+define <8 x i1> @load_elem_i8_access_8xi1() {
+; CHECK-LABEL: @load_elem_i8_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT:    ret <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>
+;
+  %p = alloca <8 x i8>, align 1, addrspace(5)
+  store <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}
+
+define <8 x i1> @storeload_elem_i1_access_8xi1() {
+; CHECK-LABEL: @storeload_elem_i1_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <16 x i1>, align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    store <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <8 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <8 x i1> [[L]]
+;
+  %p = alloca <16 x i1>, align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  store <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>, ptr addrspace(5) %g, align 1
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}
+
+define <3 x i1> @storeload_elem_i1_access_3xi1() {
+; CHECK-LABEL: @storeload_elem_i1_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <16 x i1>, align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    store <3 x i1> <i1 true, i1 false, i1 true>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca <16 x i1>, align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  store <3 x i1> <i1 true, i1 false, i1 true>, ptr addrspace(5) %g, align 1
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+define <3 x i1> @storeload_elem_i8_access_3xi1() {
+; CHECK-LABEL: @storeload_elem_i8_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <8 x i8>, align 1, addrspace(5)
+; CHECK-NEXT:    store <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, ptr addrspace(5) [[P]], align 1
+; CHECK-NEXT:    [[G:%.*]] = getelementptr <4 x i8>, ptr addrspace(5) [[P]], i64 1
+; CHECK-NEXT:    store <3 x i1> <i1 true, i1 false, i1 true>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca <8 x i8>, align 1, addrspace(5)
+  store <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  store <3 x i1> <i1 true, i1 false, i1 true>, ptr addrspace(5) %g, align 1
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+; This one is actually not problematic.
+define <8 x i1> @storeload_elem_i8_access_8xi1() {
+; CHECK-LABEL: @storeload_elem_i8_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT:    ret <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>
+;
+  %p = alloca <8 x i8>, align 1, addrspace(5)
+  store <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  store <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>, ptr addrspace(5) %g, align 1
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}
+
+define <8 x i1> @array_of_vec_elem_i1_access_8xi1() {
+; CHECK-LABEL: @array_of_vec_elem_i1_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca [2 x <16 x i1>], align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    store <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <8 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <8 x i1> [[L]]
+;
+  %p = alloca [2 x <16 x i1>], align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  store <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>, ptr addrspace(5) %g, align 1
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}
+
+define <3 x i1> @array_of_vec_elem_i1_access_3xi1() {
+; CHECK-LABEL: @array_of_vec_elem_i1_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca [2 x <16 x i1>], align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    store <3 x i1> <i1 true, i1 false, i1 true>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca [2 x <16 x i1>], align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  store <3 x i1> <i1 true, i1 false, i1 true>, ptr addrspace(5) %g, align 1
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+define <3 x i1> @array_of_vec_elem_i8_access_3xi1() {
+; CHECK-LABEL: @array_of_vec_elem_i8_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca [2 x <8 x i8>], align 1, addrspace(5)
+; CHECK-NEXT:    store <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, ptr addrspace(5) [[P]], align 1
+; CHECK-NEXT:    [[G:%.*]] = getelementptr <4 x i8>, ptr addrspace(5) [[P]], i64 1
+; CHECK-NEXT:    store <3 x i1> <i1 true, i1 false, i1 true>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca [2 x <8 x i8>], align 1, addrspace(5)
+  store <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  store <3 x i1> <i1 true, i1 false, i1 true>, ptr addrspace(5) %g, align 1
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+; This one is actually not problematic.
+define <8 x i1> @array_of_vec_elem_i8_access_8xi1() {
+; CHECK-LABEL: @array_of_vec_elem_i8_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = freeze <16 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i8> [[P]], i8 1, i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 2, i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[TMP2]], i8 3, i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 4, i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 5, i32 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 6, i32 5
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <16 x i8> [[TMP6]], i8 7, i32 6
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <16 x i8> [[TMP7]], i8 8, i32 7
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <16 x i8> [[TMP8]], i8 5, i32 4
+; CHECK-NEXT:    ret <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>
+;
+  %p = alloca [2 x <8 x i8>], align 1, addrspace(5)
+  store <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  store <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>, ptr addrspace(5) %g, align 1
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}
author	Fabian Ritter <fabian.ritter@amd.com>	2025-04-14 09:13:54 +0200
committer	GitHub <noreply@github.com>	2025-04-14 09:13:54 +0200
commit	cf188d650ce26b4ee3e11101d844361fca15ba64 (patch)
tree	2cd709b0ff88a7eac2e1d49bbc6adcace8f29dcd
parent	150e7b14f9474bc4d6891faaae4de6b8c5f6c797 (diff)
download	llvm-cf188d650ce26b4ee3e11101d844361fca15ba64.zip llvm-cf188d650ce26b4ee3e11101d844361fca15ba64.tar.gz llvm-cf188d650ce26b4ee3e11101d844361fca15ba64.tar.bz2