| author | Adel Ejjeh <adel.ejjeh@amd.com> | 2026-02-12 15:12:55 -0600 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2026-02-12 22:12:55 +0100 |
| commit | 06282d9c46264c358b6e7ecc301305cdd7049e59 (patch) | |
| tree | 9e11d69492f3a3fa30586b39202229f462b5d789 | |
| parent | e66574702479f8ecd7f2bef3e70acdb215e19cc9 (diff) | |
[SeparateConstOffsetFromGEP] Update splitGEP to handle case where including base offset results in an offset that's too large (#177653)
Currently, separate-const-offset-from-gep tries to combine the offset on the base address with the offsets on the current GEP itself when it separates constant offsets. This causes the pass to fail to separate the offset whenever the base address carries a large offset that would make the total offset larger than what the back-end can represent for the respective addressing mode. However, in many cases we can still benefit from extracting the GEP's own offset without including the offset from the base-address GEP: when the base address is used in multiple different places, this helps reduce register pressure and avoids recalculating the base address.

This PR addresses the above by retrying without the base-address GEP's offset included if isLegalAddressingMode returns false on the first attempt.
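
The shape of the problem can be sketched in LLVM IR distilled from the test added in this patch. The function names and `%var` below are illustrative; the constants 67584 and 16384 are the ones the test uses, chosen so that their sum exceeds what the LDS addressing mode can encode as an immediate while 16384 alone does not:

```llvm
@global_smem = external addrspace(3) global [0 x i8], align 16

; Input: a constant 16384 is folded into a variable index via "or disjoint",
; on top of a base pointer that already carries a 67584-byte offset.
define ptr addrspace(3) @before(i32 %var) {
  %base = getelementptr i8, ptr addrspace(3) @global_smem, i32 67584
  %off = or disjoint i32 %var, 16384
  %ptr = getelementptr inbounds nuw i8, ptr addrspace(3) %base, i32 %off
  ret ptr addrspace(3) %ptr
}

; Roughly what the pass now produces: 67584 + 16384 is too large to fold
; into one immediate, so the base offset is left alone and only the GEP's
; own 16384 is split into a trailing constant-offset GEP.
define ptr addrspace(3) @after(i32 %var) {
  %base = getelementptr i8, ptr addrspace(3) @global_smem, i32 67584
  %ptr0 = getelementptr inbounds nuw i8, ptr addrspace(3) %base, i32 %var
  %ptr = getelementptr inbounds nuw i8, ptr addrspace(3) %ptr0, i32 16384
  ret ptr addrspace(3) %ptr
}
```

Previously the pass bailed out entirely on `@before`-like input; keeping the large base offset in its own GEP while still extracting 16384 lets the back-end fold the small constant into the load's immediate offset.
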
| -rw-r--r-- | llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp | 27 |
| -rw-r--r-- | llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/amdgpu-large-lds-offset.ll | 88 |
2 files changed, 108 insertions, 7 deletions
```diff
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index e01b5797e662..c298daff3010 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -979,12 +979,11 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
   // offset from each is accumulated.
   Value *NewBase;
   const APInt *BaseOffset;
-  const bool ExtractBase =
-      match(GEP->getPointerOperand(),
-            m_PtrAdd(m_Value(NewBase), m_APInt(BaseOffset)));
+  bool ExtractBase = match(GEP->getPointerOperand(),
+                           m_PtrAdd(m_Value(NewBase), m_APInt(BaseOffset)));
   unsigned IdxWidth = DL->getIndexTypeSizeInBits(GEP->getType());
-  const APInt BaseByteOffset =
+  APInt BaseByteOffset =
       ExtractBase ? BaseOffset->sextOrTrunc(IdxWidth) : APInt(IdxWidth, 0);
 
   // The backend can already nicely handle the case where all indices are
@@ -995,8 +994,8 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
   bool Changed = canonicalizeArrayIndicesToIndexSize(GEP);
 
   bool NeedsExtraction;
-  APInt AccumulativeByteOffset =
-      BaseByteOffset + accumulateByteOffset(GEP, NeedsExtraction);
+  APInt NonBaseByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
+  APInt AccumulativeByteOffset = BaseByteOffset + NonBaseByteOffset;
 
   TargetTransformInfo &TTI = GetTTI(*GEP->getFunction());
 
@@ -1018,7 +1017,21 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
             GEP->getResultElementType(), /*BaseGV=*/nullptr,
             AccumulativeByteOffset.getSExtValue(),
             /*HasBaseReg=*/true, /*Scale=*/0, AddrSpace)) {
-      return Changed;
+      // If the addressing mode was not legal and the base byte offset was not
+      // 0, it could be a case where the total offset became too large for
+      // the addressing mode. Try again without extracting the base offset.
+      if (!ExtractBase)
+        return Changed;
+      ExtractBase = false;
+      BaseByteOffset = APInt(IdxWidth, 0);
+      AccumulativeByteOffset = NonBaseByteOffset;
+      if (!TTI.isLegalAddressingMode(
+              GEP->getResultElementType(),
+              /*BaseGV=*/nullptr, AccumulativeByteOffset.getSExtValue(),
+              /*HasBaseReg=*/true, /*Scale=*/0, AddrSpace))
+        return Changed;
+      // We can proceed with just extracting the other (non-base) offsets.
+      NeedsExtraction = true;
     }
   }
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/amdgpu-large-lds-offset.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/amdgpu-large-lds-offset.ll
new file mode 100644
index 000000000000..8bbe2b1dc4c9
--- /dev/null
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/amdgpu-large-lds-offset.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -passes=separate-const-offset-from-gep -S %s | FileCheck %s
+
+; This test is intended to test that the separate-const-offset-from-gep
+; pass will still separate offsets even if the base offset is large.
+
+@global_smem = external addrspace(3) global [0 x i8], align 16
+
+define void @large_base_offset(ptr addrspace(1) writeonly %out, i32 %idx) {
+; CHECK-LABEL: define void @large_base_offset(
+; CHECK-SAME: ptr addrspace(1) writeonly [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 15
+; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i32 [[TMP2]], 10
+; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP1]], 16
+; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = or disjoint i32 [[TMP5]], [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP2]], 5
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) @global_smem, i32 67584
+; CHECK-NEXT: [[TMP9:%.*]] = or disjoint i32 [[TMP6]], 16384
+; CHECK-NEXT: [[TMP10:%.*]] = lshr exact i32 [[TMP9]], 5
+; CHECK-NEXT: [[TMP11:%.*]] = and i32 [[TMP10]], 992
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[TMP8]], i32 [[TMP6]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[TMP12]], i32 [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[TMP13]], i32 16384
+; CHECK-NEXT: [[TMP15:%.*]] = load <16 x half>, ptr addrspace(3) [[TMP14]], align 32
+; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x half> [[TMP15]], <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: store <8 x half> [[TMP16]], ptr addrspace(1) [[TMP0]], align 16
+; CHECK-NEXT: ret void
+;
+entry:
+  %idx_low_bits = and i32 %idx, 15
+  %row_offset = shl nuw nsw i32 %idx_low_bits, 10
+  %idx_bit4 = and i32 %idx, 16
+  %idx_bit4_shifted = shl nuw nsw i32 %idx_bit4, 1
+  %combined_offset = or disjoint i32 %idx_bit4_shifted, %row_offset
+  %lane_offset = shl nuw nsw i32 %idx_low_bits, 5
+  %lds_large_base = getelementptr i8, ptr addrspace(3) @global_smem, i32 67584
+  %combined_offset_with_stride = or disjoint i32 %combined_offset, 16384
+  %aligned_offset = lshr exact i32 %combined_offset_with_stride, 5
+  %row_alignment = and i32 %aligned_offset, 992
+  %lds_vector_ptr = getelementptr inbounds nuw i8, ptr addrspace(3) %lds_large_base, i32 %combined_offset_with_stride
+  %lds_lane_ptr = getelementptr inbounds nuw i8, ptr addrspace(3) %lds_vector_ptr, i32 %row_alignment
+  %lds_wide_load = load <16 x half>, ptr addrspace(3) %lds_lane_ptr, align 32
+  %upper_half = shufflevector <16 x half> %lds_wide_load, <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <8 x half> %upper_half, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+define void @small_base_offset(ptr addrspace(1) writeonly %out, i32 %idx) {
+; CHECK-LABEL: define void @small_base_offset(
+; CHECK-SAME: ptr addrspace(1) writeonly [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 15
+; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i32 [[TMP2]], 10
+; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP1]], 16
+; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = or disjoint i32 [[TMP5]], [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP2]], 5
+; CHECK-NEXT: [[TMP8:%.*]] = or disjoint i32 [[TMP6]], 16384
+; CHECK-NEXT: [[TMP9:%.*]] = lshr exact i32 [[TMP8]], 5
+; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP9]], 992
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(3) @global_smem, i32 [[TMP6]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP11]], i32 [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP12]], i32 16388
+; CHECK-NEXT: [[TMP14:%.*]] = load <16 x half>, ptr addrspace(3) [[TMP13]], align 32
+; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x half> [[TMP14]], <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: store <8 x half> [[TMP15]], ptr addrspace(1) [[TMP0]], align 16
+; CHECK-NEXT: ret void
+;
+entry:
+  %idx_low_bits = and i32 %idx, 15
+  %row_offset = shl nuw nsw i32 %idx_low_bits, 10
+  %idx_bit4 = and i32 %idx, 16
+  %idx_bit4_shifted = shl nuw nsw i32 %idx_bit4, 1
+  %combined_offset = or disjoint i32 %idx_bit4_shifted, %row_offset
+  %lane_offset = shl nuw nsw i32 %idx_low_bits, 5
+  %lds_small_base = getelementptr i8, ptr addrspace(3) @global_smem, i32 4
+  %combined_offset_with_stride = or disjoint i32 %combined_offset, 16384
+  %aligned_offset = lshr exact i32 %combined_offset_with_stride, 5
+  %row_alignment = and i32 %aligned_offset, 992
+  %lds_vector_ptr = getelementptr inbounds nuw i8, ptr addrspace(3) %lds_small_base, i32 %combined_offset_with_stride
+  %lds_lane_ptr = getelementptr inbounds nuw i8, ptr addrspace(3) %lds_vector_ptr, i32 %row_alignment
+  %lds_wide_load = load <16 x half>, ptr addrspace(3) %lds_lane_ptr, align 32
+  %upper_half = shufflevector <16 x half> %lds_wide_load, <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <8 x half> %upper_half, ptr addrspace(1) %out, align 16
+  ret void
+}
```
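
To try the new behavior locally, the test's RUN line can be invoked directly; this assumes an opt built with the AMDGPU target enabled and is run from the repository root:

```sh
opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 \
    -passes=separate-const-offset-from-gep -S \
    llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/amdgpu-large-lds-offset.ll
```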
