aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp')
-rw-r--r--llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp112
1 files changed, 111 insertions, 1 deletions
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index a38edaf..588a55b 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -80,7 +80,9 @@ enum InstClassEnum {
TBUFFER_LOAD,
TBUFFER_STORE,
GLOBAL_LOAD,
- GLOBAL_LOAD_SADDR
+ GLOBAL_LOAD_SADDR,
+ GLOBAL_STORE,
+ GLOBAL_STORE_SADDR
};
struct AddressRegs {
@@ -244,6 +246,9 @@ private:
MachineBasicBlock::iterator
mergeGlobalLoadPair(CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore);
+ MachineBasicBlock::iterator
+ mergeGlobalStorePair(CombineInfo &CI, CombineInfo &Paired,
+ MachineBasicBlock::iterator InsertBefore);
void updateBaseAndOffset(MachineInstr &I, Register NewBase,
int32_t NewOffset) const;
@@ -316,17 +321,25 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
case AMDGPU::GLOBAL_LOAD_DWORD:
case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORD:
+ case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
return 1;
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
case AMDGPU::GLOBAL_LOAD_DWORDX2:
case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX2:
+ case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
return 2;
case AMDGPU::GLOBAL_LOAD_DWORDX3:
case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX3:
+ case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
return 3;
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::GLOBAL_LOAD_DWORDX4:
case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX4:
+ case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
return 4;
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
return 8;
@@ -421,6 +434,16 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
return GLOBAL_LOAD_SADDR;
+ case AMDGPU::GLOBAL_STORE_DWORD:
+ case AMDGPU::GLOBAL_STORE_DWORDX2:
+ case AMDGPU::GLOBAL_STORE_DWORDX3:
+ case AMDGPU::GLOBAL_STORE_DWORDX4:
+ return GLOBAL_STORE;
+ case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
+ return GLOBAL_STORE_SADDR;
}
}
@@ -464,6 +487,16 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
+ case AMDGPU::GLOBAL_STORE_DWORD:
+ case AMDGPU::GLOBAL_STORE_DWORDX2:
+ case AMDGPU::GLOBAL_STORE_DWORDX3:
+ case AMDGPU::GLOBAL_STORE_DWORDX4:
+ return AMDGPU::GLOBAL_STORE_DWORD;
+ case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
+ return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
}
}
@@ -530,12 +563,20 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
Result.SAddr = true;
LLVM_FALLTHROUGH;
case AMDGPU::GLOBAL_LOAD_DWORD:
case AMDGPU::GLOBAL_LOAD_DWORDX2:
case AMDGPU::GLOBAL_LOAD_DWORDX3:
case AMDGPU::GLOBAL_LOAD_DWORDX4:
+ case AMDGPU::GLOBAL_STORE_DWORD:
+ case AMDGPU::GLOBAL_STORE_DWORDX2:
+ case AMDGPU::GLOBAL_STORE_DWORDX3:
+ case AMDGPU::GLOBAL_STORE_DWORDX4:
Result.VAddr = true;
return Result;
}
@@ -1451,6 +1492,48 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeGlobalLoadPair(
return New;
}
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeGlobalStorePair(
+ CombineInfo &CI, CombineInfo &Paired,
+ MachineBasicBlock::iterator InsertBefore) {
+ MachineBasicBlock *MBB = CI.I->getParent();
+ DebugLoc DL = CI.I->getDebugLoc();
+
+ const unsigned Opcode = getNewOpcode(CI, Paired);
+
+ std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
+ const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
+ const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
+
+ // Copy to the new source register.
+ const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
+ Register SrcReg = MRI->createVirtualRegister(SuperRC);
+
+ const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
+ const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
+
+ BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
+ .add(*Src0)
+ .addImm(SubRegIdx0)
+ .add(*Src1)
+ .addImm(SubRegIdx1);
+
+ auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
+ .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
+ .addReg(SrcReg, RegState::Kill);
+
+ if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
+ MIB.add(*SAddr);
+
+ MachineInstr *New =
+ MIB.addImm(std::min(CI.Offset, Paired.Offset))
+ .addImm(CI.CPol)
+ .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
+
+ CI.I->eraseFromParent();
+ Paired.I->eraseFromParent();
+ return New;
+}
+
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
const CombineInfo &Paired) {
const unsigned Width = CI.Width + Paired.Width;
@@ -1501,6 +1584,28 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
case 4:
return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
}
+ case GLOBAL_STORE:
+ switch (Width) {
+ default:
+ return 0;
+ case 2:
+ return AMDGPU::GLOBAL_STORE_DWORDX2;
+ case 3:
+ return AMDGPU::GLOBAL_STORE_DWORDX3;
+ case 4:
+ return AMDGPU::GLOBAL_STORE_DWORDX4;
+ }
+ case GLOBAL_STORE_SADDR:
+ switch (Width) {
+ default:
+ return 0;
+ case 2:
+ return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
+ case 3:
+ return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
+ case 4:
+ return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
+ }
case MIMG:
assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
"No overlaps");
@@ -2140,6 +2245,11 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
NewMI = mergeGlobalLoadPair(CI, Paired, Where->I);
OptimizeListAgain |= CI.Width + Paired.Width < 4;
break;
+ case GLOBAL_STORE:
+ case GLOBAL_STORE_SADDR:
+ NewMI = mergeGlobalStorePair(CI, Paired, Where->I);
+ OptimizeListAgain |= CI.Width + Paired.Width < 4;
+ break;
}
CI.setMI(NewMI, *this);
CI.Order = Where->Order;