| author | Harrison Hao <57025411+harrisonGPU@users.noreply.github.com> | 2025-08-20 21:16:25 +0800 |
| --- | --- | --- |
| committer | GitHub <noreply@github.com> | 2025-08-20 21:16:25 +0800 |
| commit | 23a5a7bef3e7d035a3bdc239243d57b41a145d76 (patch) | |
| tree | 8197869b72e67eb2336aad937b4c2b77f592f3c4 /llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | |
| parent | 2738828c0e7b54486a8783010a514cd6aa224d92 (diff) | |
[AMDGPU] Support merging 16-bit and 8-bit TBUFFER load/store instructions (#145078)
SILoadStoreOptimizer can now recognise consecutive 16-bit and 8-bit
`TBUFFER_LOAD`/`TBUFFER_STORE` instructions that each access
* a single component (`X`), or
* two components (`XY`),
and fold them into the wider native variants:
```
X + X --> XY
X + X + X + X --> XYZW
XY + XY --> XYZW
X + X + X --> XYZ
XY + X --> XYZ
```
The optimisation cuts the number of TBUFFER instructions, shrinking code
size and improving memory throughput.
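As a quick illustration of these rules, the sketch below (standalone C++, not code from the pass; `mergedWidth` and its parameters are made-up names) computes the merged component count for one pair of adjacent accesses, including the promotion of a 3-wide result to 4 components for sub-dword formats that the diff below introduces. The real pass additionally verifies that a buffer format with the resulting component count actually exists.

```cpp
#include <cassert>

// Component count of the merged access, or 0 if the pair cannot fold
// into a native variant. For 8-bit and 16-bit components there is no
// 3-component encoding, so a 3-wide result is promoted to 4 (XYZW) and
// only its XYZ part is used.
unsigned mergedWidth(unsigned Width0, unsigned Width1, unsigned EltSize) {
  unsigned Combined = Width0 + Width1;
  if (Combined == 3 && EltSize <= 2) // 8-bit or 16-bit component
    Combined = 4;
  return Combined <= 4 ? Combined : 0;
}

int main() {
  assert(mergedWidth(1, 1, 2) == 2); // X + X   -> XY
  assert(mergedWidth(2, 2, 2) == 4); // XY + XY -> XYZW
  assert(mergedWidth(2, 1, 2) == 4); // XY + X  -> XYZ via XYZW (16-bit)
  assert(mergedWidth(2, 1, 4) == 3); // XY + X  -> XYZ (32-bit has XYZ)
}
```

The `X + X + X + X` case falls out of applying the rule pairwise: `X + X` first folds to `XY`, and `XY + XY` then folds to `XYZW`.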
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 67 |
1 file changed, 56 insertions(+), 11 deletions(-)
```diff
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index e204d6b..6f2ea8a 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -878,8 +878,12 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
     Offset = I->getOperand(OffsetIdx).getImm();
   }
 
-  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
+  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
     Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
+    const AMDGPU::GcnBufferFormatInfo *Info =
+        AMDGPU::getGcnBufferFormatInfo(Format, *LSO.STM);
+    EltSize = Info->BitsPerComp / 8;
+  }
 
   Width = getOpcodeWidth(*I, *LSO.TII);
 
@@ -1087,24 +1091,44 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
     const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
         llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
-    if (!Info0)
-      return false;
     const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
         llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
-    if (!Info1)
-      return false;
 
     if (Info0->BitsPerComp != Info1->BitsPerComp ||
         Info0->NumFormat != Info1->NumFormat)
       return false;
 
-    // TODO: Should be possible to support more formats, but if format loads
-    // are not dword-aligned, the merged load might not be valid.
-    if (Info0->BitsPerComp != 32)
+    // For 8-bit or 16-bit formats there is no 3-component variant.
+    // If NumCombinedComponents is 3, try the 4-component format and use XYZ.
+    // Example:
+    //   tbuffer_load_format_x + tbuffer_load_format_x + tbuffer_load_format_x
+    //     ==> tbuffer_load_format_xyz with format:[BUF_FMT_16_16_16_16_SNORM]
+    unsigned NumCombinedComponents = CI.Width + Paired.Width;
+    if (NumCombinedComponents == 3 && CI.EltSize <= 2)
+      NumCombinedComponents = 4;
+
+    if (getBufferFormatWithCompCount(CI.Format, NumCombinedComponents, STI) ==
+        0)
+      return false;
+
+    // Merge only when the two access ranges are strictly back-to-back;
+    // any gap or overlap can overwrite data or leave holes.
+    unsigned ElemIndex0 = CI.Offset / CI.EltSize;
+    unsigned ElemIndex1 = Paired.Offset / Paired.EltSize;
+    if (ElemIndex0 + CI.Width != ElemIndex1 &&
+        ElemIndex1 + Paired.Width != ElemIndex0)
       return false;
 
-    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
+    // 1-byte formats require 1-byte alignment.
+    // 2-byte formats require 2-byte alignment.
+    // 4-byte and larger formats require 4-byte alignment.
+    unsigned MergedBytes = CI.EltSize * NumCombinedComponents;
+    unsigned RequiredAlign = std::min(MergedBytes, 4u);
+    unsigned MinOff = std::min(CI.Offset, Paired.Offset);
+    if (MinOff % RequiredAlign != 0)
       return false;
+
+    return true;
   }
 
   uint32_t EltOffset0 = CI.Offset / CI.EltSize;
@@ -1634,8 +1658,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
   if (Regs.VAddr)
     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
 
+  // For 8-bit or 16-bit tbuffer formats there is no 3-component encoding.
+  // If the combined count is 3 (e.g. X+X+X or XY+X), promote to 4 components
+  // and use the XYZ part of XYZW to enable the merge.
+  unsigned NumCombinedComponents = CI.Width + Paired.Width;
+  if (NumCombinedComponents == 3 && CI.EltSize <= 2)
+    NumCombinedComponents = 4;
   unsigned JoinedFormat =
-      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
+      getBufferFormatWithCompCount(CI.Format, NumCombinedComponents, *STM);
 
   // It shouldn't be possible to get this far if the two instructions
   // don't have a single memoperand, because MachineInstr::mayAlias()
@@ -1677,8 +1707,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
   if (Regs.VAddr)
     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
 
+  // For 8-bit or 16-bit tbuffer formats there is no 3-component encoding.
+  // If the combined count is 3 (e.g. X+X+X or XY+X), promote to 4 components
+  // and use the XYZ part of XYZW to enable the merge.
+  unsigned NumCombinedComponents = CI.Width + Paired.Width;
+  if (NumCombinedComponents == 3 && CI.EltSize <= 2)
+    NumCombinedComponents = 4;
   unsigned JoinedFormat =
-      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
+      getBufferFormatWithCompCount(CI.Format, NumCombinedComponents, *STM);
 
   // It shouldn't be possible to get this far if the two instructions
   // don't have a single memoperand, because MachineInstr::mayAlias()
@@ -2413,6 +2449,15 @@ SILoadStoreOptimizer::collectMergeableInsts(
     if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
       continue;
 
+    if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
+      const MachineOperand *Fmt =
+          TII->getNamedOperand(MI, AMDGPU::OpName::format);
+      if (!AMDGPU::getGcnBufferFormatInfo(Fmt->getImm(), *STM)) {
+        LLVM_DEBUG(dbgs() << "Skip tbuffer with unknown format: " << MI);
+        continue;
+      }
+    }
+
     CombineInfo CI;
     CI.setMI(MI, *this);
     CI.Order = Order++;
```
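For readers skimming the hunks above, the two genuinely new legality conditions in `offsetsCanBeCombined` are the contiguity check and the alignment check. Below is a condensed, self-contained sketch of just those two checks; `Access` and `canCombine` are hypothetical stand-ins for `CombineInfo` and the real member function, and the format-table lookup (`getBufferFormatWithCompCount`) is omitted.

```cpp
#include <algorithm>
#include <cassert>

struct Access {
  unsigned Offset;  // byte offset of the access
  unsigned Width;   // components accessed (1 = X, 2 = XY, ...)
  unsigned EltSize; // bytes per component (1, 2, or 4)
};

bool canCombine(const Access &A, const Access &B) {
  // The ranges must be strictly back-to-back in element units; a gap or
  // overlap could overwrite data or leave holes.
  unsigned Idx0 = A.Offset / A.EltSize;
  unsigned Idx1 = B.Offset / B.EltSize;
  if (Idx0 + A.Width != Idx1 && Idx1 + B.Width != Idx0)
    return false;

  // 1-byte formats need 1-byte alignment, 2-byte formats 2-byte,
  // 4-byte and wider formats 4-byte.
  unsigned Combined = A.Width + B.Width;
  if (Combined == 3 && A.EltSize <= 2)
    Combined = 4; // no 3-component encoding below 32 bits
  unsigned MergedBytes = A.EltSize * Combined;
  unsigned RequiredAlign = std::min(MergedBytes, 4u);
  return std::min(A.Offset, B.Offset) % RequiredAlign == 0;
}

int main() {
  // Two 16-bit X accesses at offsets 0 and 2: contiguous and aligned.
  assert(canCombine({0, 1, 2}, {2, 1, 2}));
  // The same pair shifted to offsets 2 and 4: still contiguous, but the
  // merged 4-byte XY access would start at offset 2, missing the
  // required 4-byte alignment.
  assert(!canCombine({2, 1, 2}, {4, 1, 2}));
}
```

The `std::min(MergedBytes, 4u)` cap reflects that the merged access never needs more than dword alignment, which is why the old `BitsPerComp != 32` bail-out could be dropped in favour of these two explicit checks.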