| author | Adel Ejjeh <adel.ejjeh@amd.com> | 2026-02-12 15:12:55 -0600 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2026-02-12 22:12:55 +0100 |
| commit | 06282d9c46264c358b6e7ecc301305cdd7049e59 (patch) | |
| tree | 9e11d69492f3a3fa30586b39202229f462b5d789 | |
| parent | e66574702479f8ecd7f2bef3e70acdb215e19cc9 (diff) | |
[SeparateConstOffsetFromGEP] Update splitGEP to handle case where including base offset results in an offset that's too large (#177653)
Currently, separate-const-offset-from-gep tries to combine the offset on the base address with the offsets on the current GEP itself when it separates constant offsets. This causes the pass to fail to separate the offset whenever the base address carries a large offset that would make the total offset larger than what the back-end can represent for the respective addressing mode. However, in many cases we can still benefit from extracting the GEP's own offset without including the offset from the base-address GEP: when the base address is used in multiple different places, this helps reduce register pressure and avoids recalculating the base address.

This PR addresses the above by retrying without the base-address GEP's offset included if isLegalAddressingMode returns false on the first attempt.
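
The shape of the problem can be sketched in LLVM IR distilled from the test added in this patch. The function names and `%var` below are illustrative; the constants 67584 and 16384 are the ones the test uses, chosen so that their sum exceeds what the LDS addressing mode can encode as an immediate while 16384 alone does not:

```llvm
@global_smem = external addrspace(3) global [0 x i8], align 16

; Input: a constant 16384 is folded into a variable index via "or disjoint",
; on top of a base pointer that already carries a 67584-byte offset.
define ptr addrspace(3) @before(i32 %var) {
  %base = getelementptr i8, ptr addrspace(3) @global_smem, i32 67584
  %off = or disjoint i32 %var, 16384
  %ptr = getelementptr inbounds nuw i8, ptr addrspace(3) %base, i32 %off
  ret ptr addrspace(3) %ptr
}

; Roughly what the pass now produces: 67584 + 16384 is too large to fold
; into one immediate, so the base offset is left alone and only the GEP's
; own 16384 is split into a trailing constant-offset GEP.
define ptr addrspace(3) @after(i32 %var) {
  %base = getelementptr i8, ptr addrspace(3) @global_smem, i32 67584
  %ptr0 = getelementptr inbounds nuw i8, ptr addrspace(3) %base, i32 %var
  %ptr = getelementptr inbounds nuw i8, ptr addrspace(3) %ptr0, i32 16384
  ret ptr addrspace(3) %ptr
}
```

Previously the pass bailed out entirely on `@before`-like input; keeping the large base offset in its own GEP while still extracting 16384 lets the back-end fold the small constant into the load's immediate offset.
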
| -rw-r--r-- | llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp | 27 |
| -rw-r--r-- | llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/amdgpu-large-lds-offset.ll | 88 |
2 files changed, 108 insertions, 7 deletions
```diff
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index e01b5797e662..c298daff3010 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -979,12 +979,11 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
   // offset from each is accumulated.
   Value *NewBase;
   const APInt *BaseOffset;
-  const bool ExtractBase =
-      match(GEP->getPointerOperand(),
-            m_PtrAdd(m_Value(NewBase), m_APInt(BaseOffset)));
+  bool ExtractBase = match(GEP->getPointerOperand(),
+                           m_PtrAdd(m_Value(NewBase), m_APInt(BaseOffset)));
   unsigned IdxWidth = DL->getIndexTypeSizeInBits(GEP->getType());
-  const APInt BaseByteOffset =
+  APInt BaseByteOffset =
       ExtractBase ? BaseOffset->sextOrTrunc(IdxWidth) : APInt(IdxWidth, 0);
 
   // The backend can already nicely handle the case where all indices are
@@ -995,8 +994,8 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
   bool Changed = canonicalizeArrayIndicesToIndexSize(GEP);
 
   bool NeedsExtraction;
-  APInt AccumulativeByteOffset =
-      BaseByteOffset + accumulateByteOffset(GEP, NeedsExtraction);
+  APInt NonBaseByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
+  APInt AccumulativeByteOffset = BaseByteOffset + NonBaseByteOffset;
 
   TargetTransformInfo &TTI = GetTTI(*GEP->getFunction());
 
@@ -1018,7 +1017,21 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
             GEP->getResultElementType(), /*BaseGV=*/nullptr,
             AccumulativeByteOffset.getSExtValue(),
             /*HasBaseReg=*/true, /*Scale=*/0, AddrSpace)) {
-      return Changed;
+      // If the addressing mode was not legal and the base byte offset was not
+      // 0, it could be a case where the total offset became too large for
+      // the addressing mode. Try again without extracting the base offset.
+      if (!ExtractBase)
+        return Changed;
+      ExtractBase = false;
+      BaseByteOffset = APInt(IdxWidth, 0);
+      AccumulativeByteOffset = NonBaseByteOffset;
+      if (!TTI.isLegalAddressingMode(
+              GEP->getResultElementType(),
+              /*BaseGV=*/nullptr, AccumulativeByteOffset.getSExtValue(),
+              /*HasBaseReg=*/true, /*Scale=*/0, AddrSpace))
+        return Changed;
+      // We can proceed with just extracting the other (non-base) offsets.
+      NeedsExtraction = true;
     }
   }
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/amdgpu-large-lds-offset.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/amdgpu-large-lds-offset.ll
new file mode 100644
index 000000000000..8bbe2b1dc4c9
--- /dev/null
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/amdgpu-large-lds-offset.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -passes=separate-const-offset-from-gep -S %s | FileCheck %s
+
+; This test is intended to test that the separate-const-offset-from-gep
+; pass will still separate offsets even if the base offset is large.
+
+@global_smem = external addrspace(3) global [0 x i8], align 16
+
+define void @large_base_offset(ptr addrspace(1) writeonly %out, i32 %idx) {
+; CHECK-LABEL: define void @large_base_offset(
+; CHECK-SAME: ptr addrspace(1) writeonly [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 15
+; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i32 [[TMP2]], 10
+; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP1]], 16
+; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = or disjoint i32 [[TMP5]], [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP2]], 5
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) @global_smem, i32 67584
+; CHECK-NEXT: [[TMP9:%.*]] = or disjoint i32 [[TMP6]], 16384
+; CHECK-NEXT: [[TMP10:%.*]] = lshr exact i32 [[TMP9]], 5
+; CHECK-NEXT: [[TMP11:%.*]] = and i32 [[TMP10]], 992
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[TMP8]], i32 [[TMP6]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[TMP12]], i32 [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[TMP13]], i32 16384
+; CHECK-NEXT: [[TMP15:%.*]] = load <16 x half>, ptr addrspace(3) [[TMP14]], align 32
+; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x half> [[TMP15]], <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: store <8 x half> [[TMP16]], ptr addrspace(1) [[TMP0]], align 16
+; CHECK-NEXT: ret void
+;
+entry:
+  %idx_low_bits = and i32 %idx, 15
+  %row_offset = shl nuw nsw i32 %idx_low_bits, 10
+  %idx_bit4 = and i32 %idx, 16
+  %idx_bit4_shifted = shl nuw nsw i32 %idx_bit4, 1
+  %combined_offset = or disjoint i32 %idx_bit4_shifted, %row_offset
+  %lane_offset = shl nuw nsw i32 %idx_low_bits, 5
+  %lds_large_base = getelementptr i8, ptr addrspace(3) @global_smem, i32 67584
+  %combined_offset_with_stride = or disjoint i32 %combined_offset, 16384
+  %aligned_offset = lshr exact i32 %combined_offset_with_stride, 5
+  %row_alignment = and i32 %aligned_offset, 992
+  %lds_vector_ptr = getelementptr inbounds nuw i8, ptr addrspace(3) %lds_large_base, i32 %combined_offset_with_stride
+  %lds_lane_ptr = getelementptr inbounds nuw i8, ptr addrspace(3) %lds_vector_ptr, i32 %row_alignment
+  %lds_wide_load = load <16 x half>, ptr addrspace(3) %lds_lane_ptr, align 32
+  %upper_half = shufflevector <16 x half> %lds_wide_load, <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <8 x half> %upper_half, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+define void @small_base_offset(ptr addrspace(1) writeonly %out, i32 %idx) {
+; CHECK-LABEL: define void @small_base_offset(
+; CHECK-SAME: ptr addrspace(1) writeonly [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 15
+; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i32 [[TMP2]], 10
+; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP1]], 16
+; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = or disjoint i32 [[TMP5]], [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP2]], 5
+; CHECK-NEXT: [[TMP8:%.*]] = or disjoint i32 [[TMP6]], 16384
+; CHECK-NEXT: [[TMP9:%.*]] = lshr exact i32 [[TMP8]], 5
+; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP9]], 992
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(3) @global_smem, i32 [[TMP6]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP11]], i32 [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP12]], i32 16388
+; CHECK-NEXT: [[TMP14:%.*]] = load <16 x half>, ptr addrspace(3) [[TMP13]], align 32
+; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x half> [[TMP14]], <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: store <8 x half> [[TMP15]], ptr addrspace(1) [[TMP0]], align 16
+; CHECK-NEXT: ret void
+;
+entry:
+  %idx_low_bits = and i32 %idx, 15
+  %row_offset = shl nuw nsw i32 %idx_low_bits, 10
+  %idx_bit4 = and i32 %idx, 16
+  %idx_bit4_shifted = shl nuw nsw i32 %idx_bit4, 1
+  %combined_offset = or disjoint i32 %idx_bit4_shifted, %row_offset
+  %lane_offset = shl nuw nsw i32 %idx_low_bits, 5
+  %lds_small_base = getelementptr i8, ptr addrspace(3) @global_smem, i32 4
+  %combined_offset_with_stride = or disjoint i32 %combined_offset, 16384
+  %aligned_offset = lshr exact i32 %combined_offset_with_stride, 5
+  %row_alignment = and i32 %aligned_offset, 992
+  %lds_vector_ptr = getelementptr inbounds nuw i8, ptr addrspace(3) %lds_small_base, i32 %combined_offset_with_stride
+  %lds_lane_ptr = getelementptr inbounds nuw i8, ptr addrspace(3) %lds_vector_ptr, i32 %row_alignment
+  %lds_wide_load = load <16 x half>, ptr addrspace(3) %lds_lane_ptr, align 32
+  %upper_half = shufflevector <16 x half> %lds_wide_load, <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <8 x half> %upper_half, ptr addrspace(1) %out, align 16
+  ret void
+}
```
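
To try the new behavior locally, the test's RUN line can be invoked directly; this assumes an opt built with the AMDGPU target enabled and is run from the repository root:

```sh
opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 \
    -passes=separate-const-offset-from-gep -S \
    llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/amdgpu-large-lds-offset.ll
```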
