diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 83 |
1 files changed, 83 insertions, 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index ed2b957..2a3271b 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -79,6 +79,7 @@ enum InstClassEnum { MIMG, TBUFFER_LOAD, TBUFFER_STORE, + GLOBAL_LOAD }; struct AddressRegs { @@ -233,6 +234,9 @@ private: MachineBasicBlock::iterator mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore); + MachineBasicBlock::iterator + mergeGlobalLoadPair(CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore); void updateBaseAndOffset(MachineInstr &I, Register NewBase, int32_t NewOffset) const; @@ -300,10 +304,15 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { switch (Opc) { case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + case AMDGPU::GLOBAL_LOAD_DWORD: return 1; case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::GLOBAL_LOAD_DWORDX2: return 2; + case AMDGPU::GLOBAL_LOAD_DWORDX3: + return 3; case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + case AMDGPU::GLOBAL_LOAD_DWORDX4: return 4; case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: return 8; @@ -388,6 +397,11 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::DS_WRITE_B64: case AMDGPU::DS_WRITE_B64_gfx9: return DS_WRITE; + case AMDGPU::GLOBAL_LOAD_DWORD: + case AMDGPU::GLOBAL_LOAD_DWORDX2: + case AMDGPU::GLOBAL_LOAD_DWORDX3: + case AMDGPU::GLOBAL_LOAD_DWORDX4: + return GLOBAL_LOAD; } } @@ -421,6 +435,11 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; + case AMDGPU::GLOBAL_LOAD_DWORD: + case AMDGPU::GLOBAL_LOAD_DWORDX2: + case AMDGPU::GLOBAL_LOAD_DWORDX3: + case AMDGPU::GLOBAL_LOAD_DWORDX4: + return AMDGPU::GLOBAL_LOAD_DWORD; } } @@ -483,6 +502,12 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::DS_WRITE_B64_gfx9: Result.Addr = true; return Result; + case AMDGPU::GLOBAL_LOAD_DWORD: + case AMDGPU::GLOBAL_LOAD_DWORDX2: + case AMDGPU::GLOBAL_LOAD_DWORDX3: + case AMDGPU::GLOBAL_LOAD_DWORDX4: + Result.VAddr = true; + return Result; } } @@ -1364,6 +1389,49 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( return New; } +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeGlobalLoadPair( + CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore) { + MachineBasicBlock *MBB = CI.I->getParent(); + DebugLoc DL = CI.I->getDebugLoc(); + + const unsigned Opcode = getNewOpcode(CI, Paired); + + const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); + Register DestReg = MRI->createVirtualRegister(SuperRC); + + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); + + const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); + const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); + + MachineInstr *New = + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) + .addImm(std::min(CI.Offset, Paired.Offset)) + .addImm(CI.CPol) + .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + + std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); + const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); + const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); + + // Copy to the old destination registers. + const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); + const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); + const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); + + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest0) // Copy to same destination including flags and sub reg. + .addReg(DestReg, 0, SubRegIdx0); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); + + CI.I->eraseFromParent(); + Paired.I->eraseFromParent(); + return New; +} + unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired) { const unsigned Width = CI.Width + Paired.Width; @@ -1392,6 +1460,17 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, case 8: return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; } + case GLOBAL_LOAD: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::GLOBAL_LOAD_DWORDX2; + case 3: + return AMDGPU::GLOBAL_LOAD_DWORDX3; + case 4: + return AMDGPU::GLOBAL_LOAD_DWORDX4; + } case MIMG: assert((countPopulation(CI.DMask | Paired.DMask) == Width) && "No overlaps"); @@ -2035,6 +2114,10 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( NewMI = mergeTBufferStorePair(CI, Paired, Where->I); OptimizeListAgain |= CI.Width + Paired.Width < 4; break; + case GLOBAL_LOAD: + NewMI = mergeGlobalLoadPair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; + break; } CI.setMI(NewMI, *this); CI.Order = Where->Order; |