//===-- AMDGPURewriteAGPRCopyMFMA.cpp -------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // /// \file \brief Try to replace MFMA instructions using VGPRs with MFMA /// instructions using AGPRs. We expect MFMAs to be selected using VGPRs, and /// only use AGPRs if it helps avoid spilling. In this case, the MFMA will have /// copies between AGPRs and VGPRs and the AGPR variant of an MFMA pseudo. This /// pass will attempt to delete the cross register bank copy and replace the /// MFMA opcode. /// /// TODO: /// - Handle non-tied dst+src2 cases. We need to try to find a copy from an /// AGPR from src2, or reassign src2 to an available AGPR (which should work /// in the common case of a load). /// /// - Handle multiple MFMA uses of the same register. e.g. chained MFMAs that /// can be rewritten as a set /// /// - Update LiveIntervals incrementally instead of recomputing from scratch /// //===----------------------------------------------------------------------===// #include "AMDGPU.h" #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveRegMatrix.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/InitializePasses.h" using namespace llvm; #define DEBUG_TYPE "amdgpu-rewrite-agpr-copy-mfma" namespace { class AMDGPURewriteAGPRCopyMFMAImpl { const GCNSubtarget &ST; const SIInstrInfo &TII; const SIRegisterInfo &TRI; MachineRegisterInfo &MRI; VirtRegMap &VRM; LiveRegMatrix &LRM; LiveIntervals &LIS; public: AMDGPURewriteAGPRCopyMFMAImpl(MachineFunction &MF, VirtRegMap &VRM, LiveRegMatrix &LRM, LiveIntervals &LIS) : ST(MF.getSubtarget()), TII(*ST.getInstrInfo()), TRI(*ST.getRegisterInfo()), MRI(MF.getRegInfo()), VRM(VRM), LRM(LRM), LIS(LIS) {} /// Compute the register class constraints based on the uses of \p Reg, /// excluding uses from \p ExceptMI. This should be nearly identical to /// MachineRegisterInfo::recomputeRegClass. const TargetRegisterClass * recomputeRegClassExcept(Register Reg, const TargetRegisterClass *OldRC, const TargetRegisterClass *NewRC, const MachineInstr *ExceptMI) const; bool run(MachineFunction &MF) const; }; const TargetRegisterClass * AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExcept( Register Reg, const TargetRegisterClass *OldRC, const TargetRegisterClass *NewRC, const MachineInstr *ExceptMI) const { // Accumulate constraints from all uses. for (MachineOperand &MO : MRI.reg_nodbg_operands(Reg)) { // Apply the effect of the given operand to NewRC. MachineInstr *MI = MO.getParent(); if (MI == ExceptMI) continue; unsigned OpNo = &MO - &MI->getOperand(0); NewRC = MI->getRegClassConstraintEffect(OpNo, NewRC, &TII, &TRI); if (!NewRC || NewRC == OldRC) return nullptr; } return NewRC; } bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { // This only applies on subtargets that have a configurable AGPR vs. VGPR // allocation. if (!ST.hasGFX90AInsts()) return false; // Early exit if no AGPRs were assigned. if (!LRM.isPhysRegUsed(AMDGPU::AGPR0)) return false; bool MadeChange = false; for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { Register VReg = Register::index2VirtReg(I); Register PhysReg = VRM.getPhys(VReg); if (!PhysReg) continue; // Find AV_* registers assigned to AGPRs. const TargetRegisterClass *VirtRegRC = MRI.getRegClass(VReg); if (!TRI.isVectorSuperClass(VirtRegRC)) continue; const TargetRegisterClass *AssignedRC = TRI.getPhysRegBaseClass(PhysReg); if (!TRI.isAGPRClass(AssignedRC)) continue; LiveInterval &LI = LIS.getInterval(VReg); // TODO: Test multiple uses for (VNInfo *VNI : LI.vnis()) { MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def); // TODO: Handle SplitKit produced copy bundles for partially defined // registers. if (!DefMI || !DefMI->isFullCopy()) continue; Register CopySrcReg = DefMI->getOperand(1).getReg(); if (!CopySrcReg.isVirtual()) continue; LiveInterval &CopySrcLI = LIS.getInterval(CopySrcReg); LiveQueryResult LRQ = CopySrcLI.Query(VNI->def.getRegSlot()); MachineInstr *CopySrcMI = LIS.getInstructionFromIndex(LRQ.valueIn()->def); if (!CopySrcMI) continue; int AGPROp = AMDGPU::getMFMASrcCVDstAGPROp(CopySrcMI->getOpcode()); if (AGPROp == -1) continue; MachineOperand *Src2 = TII.getNamedOperand(*CopySrcMI, AMDGPU::OpName::src2); // FIXME: getMinimalPhysRegClass returns a nonsense AV_* subclass instead // of an AGPR or VGPR subclass, so we can't simply use the result on the // assignment. LLVM_DEBUG({ Register Src2PhysReg = VRM.getPhys(Src2->getReg()); dbgs() << "Attempting to replace VGPR MFMA with AGPR version:" << " Dst=[" << printReg(VReg) << " => " << printReg(PhysReg, &TRI) << "], Src2=[" << printReg(Src2->getReg(), &TRI) << " => " << printReg(Src2PhysReg, &TRI) << "]: " << *CopySrcMI; }); // If the inputs are tied and the same register, we can shortcut and // directly replace the register. if (Src2->getReg() != CopySrcReg) { LLVM_DEBUG( dbgs() << "Replacing untied VGPR MFMAs with AGPR form not yet handled\n"); // TODO: Only handles the tied case for now. If the input operand is a // different register, we need to also reassign it (either by looking // for a compatible copy-from-AGPR, or by seeing if an available AGPR is // compatible with all other uses. // If we can't reassign it, we'd need to introduce a different copy // which is likely worse than the copy we'd be saving. continue; } const TargetRegisterClass *Src2VirtRegRC = MRI.getRegClass(Src2->getReg()); // We've found av = COPY (MFMA), and need to verify that we can trivially // rewrite src2 to use the new AGPR. If we can't trivially replace it, // we're going to induce as many copies as we would have emitted in the // first place, as well as need to assign another register, and need to // figure out where to put them. The live range splitting is smarter than // anything we're doing here, so trust it did something reasonable. const TargetRegisterClass *Src2ExceptRC = recomputeRegClassExcept( Src2->getReg(), Src2VirtRegRC, VirtRegRC, CopySrcMI); if (!Src2ExceptRC) continue; const TargetRegisterClass *NewSrc2ConstraintRC = TII.getRegClass(TII.get(AGPROp), Src2->getOperandNo(), &TRI, MF); // Try to constrain src2 to the replacement instruction candidate's // register class. const TargetRegisterClass *NewSrc2RC = TRI.getCommonSubClass(Src2ExceptRC, NewSrc2ConstraintRC); if (!NewSrc2RC) { // TODO: This is ignoring ther rewritable uses. e.g. a rewritable MFMA // using a rewritable MFMA can be rewritten as a pair. LLVM_DEBUG(dbgs() << "Other uses of " << printReg(Src2->getReg(), &TRI) << " are incompatible with replacement class\n"); continue; } MRI.setRegClass(VReg, AssignedRC); MRI.setRegClass(Src2->getReg(), NewSrc2RC); CopySrcMI->setDesc(TII.get(AGPROp)); // TODO: Is replacing too aggressive, fixup these instructions only? MRI.replaceRegWith(CopySrcReg, VReg); LLVM_DEBUG(dbgs() << "Replaced VGPR MFMA with AGPR: " << *CopySrcMI); // We left behind an identity copy, so delete it. LIS.RemoveMachineInstrFromMaps(*DefMI); DefMI->eraseFromParent(); LRM.unassign(CopySrcLI); // We don't need the liveness information anymore, so don't bother // updating the intervals. Just delete the stale information. // TODO: Is it worth preserving these? LIS.removeInterval(CopySrcReg); LIS.removeInterval(VReg); LIS.createAndComputeVirtRegInterval(VReg); MadeChange = true; } } return MadeChange; } class AMDGPURewriteAGPRCopyMFMALegacy : public MachineFunctionPass { public: static char ID; AMDGPURewriteAGPRCopyMFMALegacy() : MachineFunctionPass(ID) { initializeAMDGPURewriteAGPRCopyMFMALegacyPass( *PassRegistry::getPassRegistry()); } bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { return "AMDGPU Rewrite AGPR-Copy-MFMA"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); } }; } // End anonymous namespace. INITIALIZE_PASS_BEGIN(AMDGPURewriteAGPRCopyMFMALegacy, DEBUG_TYPE, "AMDGPU Rewrite AGPR-Copy-MFMA", false, false) INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy) INITIALIZE_PASS_END(AMDGPURewriteAGPRCopyMFMALegacy, DEBUG_TYPE, "AMDGPU Rewrite AGPR-Copy-MFMA", false, false) char AMDGPURewriteAGPRCopyMFMALegacy::ID = 0; char &llvm::AMDGPURewriteAGPRCopyMFMALegacyID = AMDGPURewriteAGPRCopyMFMALegacy::ID; bool AMDGPURewriteAGPRCopyMFMALegacy::runOnMachineFunction( MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; auto &VRM = getAnalysis().getVRM(); auto &LRM = getAnalysis().getLRM(); auto &LIS = getAnalysis().getLIS(); AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS); return Impl.run(MF); } PreservedAnalyses AMDGPURewriteAGPRCopyMFMAPass::run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) { VirtRegMap &VRM = MFAM.getResult(MF); LiveRegMatrix &LRM = MFAM.getResult(MF); LiveIntervals &LIS = MFAM.getResult(MF); AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS); if (!Impl.run(MF)) return PreservedAnalyses::all(); auto PA = getMachineFunctionPassPreservedAnalyses(); PA.preserveSet(); return PA; }