author | Yuta Mukai <mukai.yuta@fujitsu.com> | 2024-06-12 10:27:35 +0900 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-06-12 10:27:35 +0900 |
commit | 0c5319e546321d7a766999e49e0ccf801ff2b3dc (patch) | |
tree | 9f58c95e1518157ffaebf73ceed691c195f291e7 /llvm/lib/CodeGen | |
parent | 46c05dfb6c98870f8416eeb9bf787d54ac806b12 (diff) | |
[ModuloSchedule][AArch64] Implement modulo variable expansion for pipelining (#65609)
Modulo variable expansion (MVE) is a technique that resolves overlapping
variable lifetimes in a software-pipelined loop by unrolling the kernel.
The existing expander instead resolves them by inserting register copies
(move instructions), which works for processors with ordinary registers
such as Arm and x86, but may emit a very large number of moves and hurt
performance.
The MVE code generator is enabled by specifying -pipeliner-mve-cg. A
backend must implement the newly defined interfaces in
PipelinerLoopInfo; this patch implements them for AArch64.
Discourse thread:
https://discourse.llvm.org/t/implementing-modulo-variable-expansion-for-machinepipeliner
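
The backend-facing side of the change is not shown in the diff below (this page is limited to llvm/lib/CodeGen), but the two PipelinerLoopInfo hooks the expander relies on are visible in the CodeGen changes: isMVEExpanderSupported() gates the new path in MachinePipeliner, and createRemainingIterationsGreaterCondition() builds the trip-count branches. The following is a minimal sketch of what a target would override; the class name ExampleLoopInfo and the hook bodies are hypothetical, and the exact signatures should be taken from TargetInstrInfo.h rather than from this sketch.

```cpp
// Hypothetical sketch only: a target's PipelinerLoopInfo opting in to the
// MVE expander. The hook names come from the patch above; everything else
// (class name, condition logic) is illustrative.
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

using namespace llvm;

class ExampleLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
public:
  // Advertise support so MachinePipeliner may pick ModuloScheduleExpanderMVE
  // when -pipeliner-mve-cg is given.
  bool isMVEExpanderSupported() override { return true; }

  // Append operands to Cond that evaluate to true when the remaining trip
  // count is greater than TC. The expander calls this for the Check block,
  // the kernel back-edge, and the epilog branch. LastStage0Insts maps the
  // original stage-0 instructions to their copies in the last unrolled
  // kernel iteration, which is where the current loop counter can be found.
  void createRemainingIterationsGreaterCondition(
      int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
      DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override {
    // Target-specific: emit a compare of the loop counter against TC at the
    // end of MBB and push the resulting condition operands onto Cond.
  }
};
```

With such hooks in place, the new path is taken only when -pipeliner-mve-cg is passed, the target reports support, and ModuloScheduleExpanderMVE::canApply() accepts the loop; otherwise the pipeliner falls back to the existing expanders, as the MachinePipeliner.cpp hunk below shows.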
Diffstat (limited to 'llvm/lib/CodeGen')
-rw-r--r-- | llvm/lib/CodeGen/MachinePipeliner.cpp | 9 |
-rw-r--r-- | llvm/lib/CodeGen/ModuloSchedule.cpp | 640 |
2 files changed, 649 insertions, 0 deletions
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index 32f65f0..6c24cfc 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -192,6 +192,10 @@ static cl::opt<int> cl::desc("Margin representing the unused percentage of " "the register pressure limit")); +static cl::opt<bool> + MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false), + cl::desc("Use the MVE code generator for software pipelining")); + namespace llvm { // A command line option to enable the CopyToPhi DAG mutation. @@ -677,6 +681,11 @@ void SwingSchedulerDAG::schedule() { if (ExperimentalCodeGen && NewInstrChanges.empty()) { PeelingModuloScheduleExpander MSE(MF, MS, &LIS); MSE.expand(); + } else if (MVECodeGen && NewInstrChanges.empty() && + LoopPipelinerInfo->isMVEExpanderSupported() && + ModuloScheduleExpanderMVE::canApply(Loop)) { + ModuloScheduleExpanderMVE MSE(MF, MS, LIS); + MSE.expand(); } else { ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges)); MSE.expand(); diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp index b912112..0aed235 100644 --- a/llvm/lib/CodeGen/ModuloSchedule.cpp +++ b/llvm/lib/CodeGen/ModuloSchedule.cpp @@ -22,6 +22,10 @@ #define DEBUG_TYPE "pipeliner" using namespace llvm; +static cl::opt<bool> SwapBranchTargetsMVE( + "pipeliner-swap-branch-targets-mve", cl::Hidden, cl::init(false), + cl::desc("Swap target blocks of a conditional branch for MVE expander")); + void ModuloSchedule::print(raw_ostream &OS) { for (MachineInstr *MI : ScheduledInstrs) OS << "[stage " << getStage(MI) << " @" << getCycle(MI) << "c] " << *MI; @@ -2097,6 +2101,642 @@ void PeelingModuloScheduleExpander::validateAgainstModuloScheduleExpander() { MSE.cleanup(); } +MachineInstr *ModuloScheduleExpanderMVE::cloneInstr(MachineInstr *OldMI) { + MachineInstr *NewMI = MF.CloneMachineInstr(OldMI); + + // TODO: Offset information needs to be corrected. + NewMI->dropMemRefs(MF); + + return NewMI; +} + +/// Create a dedicated exit for Loop. Exit is the original exit for Loop. +/// If it is already dedicated exit, return it. Otherwise, insert a new +/// block between them and return the new block. +static MachineBasicBlock *createDedicatedExit(MachineBasicBlock *Loop, + MachineBasicBlock *Exit) { + if (Exit->pred_size() == 1) + return Exit; + + MachineFunction *MF = Loop->getParent(); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); + + MachineBasicBlock *NewExit = + MF->CreateMachineBasicBlock(Loop->getBasicBlock()); + MF->insert(Loop->getIterator(), NewExit); + + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + SmallVector<MachineOperand, 4> Cond; + TII->analyzeBranch(*Loop, TBB, FBB, Cond); + if (TBB == Loop) + FBB = NewExit; + else if (FBB == Loop) + TBB = NewExit; + else + llvm_unreachable("unexpected loop structure"); + TII->removeBranch(*Loop); + TII->insertBranch(*Loop, TBB, FBB, Cond, DebugLoc()); + Loop->replaceSuccessor(Exit, NewExit); + TII->insertUnconditionalBranch(*NewExit, Exit, DebugLoc()); + NewExit->addSuccessor(Exit); + + Exit->replacePhiUsesWith(Loop, NewExit); + + return NewExit; +} + +/// Insert branch code into the end of MBB. It branches to GreaterThan if the +/// remaining trip count for instructions in LastStage0Insts is greater than +/// RequiredTC, and to Otherwise otherwise. 
+void ModuloScheduleExpanderMVE::insertCondBranch(MachineBasicBlock &MBB, + int RequiredTC, + InstrMapTy &LastStage0Insts, + MachineBasicBlock &GreaterThan, + MachineBasicBlock &Otherwise) { + SmallVector<MachineOperand, 4> Cond; + LoopInfo->createRemainingIterationsGreaterCondition(RequiredTC, MBB, Cond, + LastStage0Insts); + + if (SwapBranchTargetsMVE) { + // Set SwapBranchTargetsMVE to true if a target prefers to replace TBB and + // FBB for optimal performance. + if (TII->reverseBranchCondition(Cond)) + llvm_unreachable("can not reverse branch condition"); + TII->insertBranch(MBB, &Otherwise, &GreaterThan, Cond, DebugLoc()); + } else { + TII->insertBranch(MBB, &GreaterThan, &Otherwise, Cond, DebugLoc()); + } +} + +/// Generate a pipelined loop that is unrolled by using MVE algorithm and any +/// other necessary blocks. The control flow is modified to execute the +/// pipelined loop if the trip count satisfies the condition, otherwise the +/// original loop. The original loop is also used to execute the remainder +/// iterations which occur due to unrolling. +void ModuloScheduleExpanderMVE::generatePipelinedLoop() { + // The control flow for pipelining with MVE: + // + // OrigPreheader: + // // The block that is originally the loop preheader + // goto Check + // + // Check: + // // Check whether the trip count satisfies the requirements to pipeline. + // if (LoopCounter > NumStages + NumUnroll - 2) + // // The minimum number of iterations to pipeline = + // // iterations executed in prolog/epilog (NumStages-1) + + // // iterations executed in one kernel run (NumUnroll) + // goto Prolog + // // fallback to the original loop + // goto NewPreheader + // + // Prolog: + // // All prolog stages. There are no direct branches to the epilogue. + // goto NewKernel + // + // NewKernel: + // // NumUnroll copies of the kernel + // if (LoopCounter > MVE-1) + // goto NewKernel + // goto Epilog + // + // Epilog: + // // All epilog stages. + // if (LoopCounter > 0) + // // The remainder is executed in the original loop + // goto NewPreheader + // goto NewExit + // + // NewPreheader: + // // Newly created preheader for the original loop. + // // The initial values of the phis in the loop are merged from two paths. + // NewInitVal = Phi OrigInitVal, Check, PipelineLastVal, Epilog + // goto OrigKernel + // + // OrigKernel: + // // The original loop block. + // if (LoopCounter != 0) + // goto OrigKernel + // goto NewExit + // + // NewExit: + // // Newly created dedicated exit for the original loop. + // // Merge values which are referenced after the loop + // Merged = Phi OrigVal, OrigKernel, PipelineVal, Epilog + // goto OrigExit + // + // OrigExit: + // // The block that is originally the loop exit. + // // If it is already deicated exit, NewExit is not created. 
+ + // An example of where each stage is executed: + // Assume #Stages 3, #MVE 4, #Iterations 12 + // Iter 0 1 2 3 4 5 6 7 8 9 10-11 + // ------------------------------------------------- + // Stage 0 Prolog#0 + // Stage 1 0 Prolog#1 + // Stage 2 1 0 Kernel Unroll#0 Iter#0 + // Stage 2 1 0 Kernel Unroll#1 Iter#0 + // Stage 2 1 0 Kernel Unroll#2 Iter#0 + // Stage 2 1 0 Kernel Unroll#3 Iter#0 + // Stage 2 1 0 Kernel Unroll#0 Iter#1 + // Stage 2 1 0 Kernel Unroll#1 Iter#1 + // Stage 2 1 0 Kernel Unroll#2 Iter#1 + // Stage 2 1 0 Kernel Unroll#3 Iter#1 + // Stage 2 1 Epilog#0 + // Stage 2 Epilog#1 + // Stage 0-2 OrigKernel + + LoopInfo = TII->analyzeLoopForPipelining(OrigKernel); + assert(LoopInfo && "Must be able to analyze loop!"); + + calcNumUnroll(); + + Check = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock()); + Prolog = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock()); + NewKernel = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock()); + Epilog = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock()); + NewPreheader = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock()); + + MF.insert(OrigKernel->getIterator(), Check); + MF.insert(OrigKernel->getIterator(), Prolog); + MF.insert(OrigKernel->getIterator(), NewKernel); + MF.insert(OrigKernel->getIterator(), Epilog); + MF.insert(OrigKernel->getIterator(), NewPreheader); + + NewExit = createDedicatedExit(OrigKernel, OrigExit); + + NewPreheader->transferSuccessorsAndUpdatePHIs(OrigPreheader); + TII->insertUnconditionalBranch(*NewPreheader, OrigKernel, DebugLoc()); + + OrigPreheader->addSuccessor(Check); + TII->removeBranch(*OrigPreheader); + TII->insertUnconditionalBranch(*OrigPreheader, Check, DebugLoc()); + + Check->addSuccessor(Prolog); + Check->addSuccessor(NewPreheader); + + Prolog->addSuccessor(NewKernel); + + NewKernel->addSuccessor(NewKernel); + NewKernel->addSuccessor(Epilog); + + Epilog->addSuccessor(NewPreheader); + Epilog->addSuccessor(NewExit); + + InstrMapTy LastStage0Insts; + insertCondBranch(*Check, Schedule.getNumStages() + NumUnroll - 2, + LastStage0Insts, *Prolog, *NewPreheader); + + // VRMaps map (prolog/kernel/epilog phase#, original register#) to new + // register# + SmallVector<ValueMapTy> PrologVRMap, KernelVRMap, EpilogVRMap; + generateProlog(PrologVRMap); + generateKernel(PrologVRMap, KernelVRMap, LastStage0Insts); + generateEpilog(KernelVRMap, EpilogVRMap, LastStage0Insts); +} + +/// Replace MI's use operands according to the maps. +void ModuloScheduleExpanderMVE::updateInstrUse( + MachineInstr *MI, int StageNum, int PhaseNum, + SmallVectorImpl<ValueMapTy> &CurVRMap, + SmallVectorImpl<ValueMapTy> *PrevVRMap) { + // If MI is in the prolog/kernel/epilog block, CurVRMap is + // PrologVRMap/KernelVRMap/EpilogVRMap respectively. + // PrevVRMap is nullptr/PhiVRMap/KernelVRMap respectively. + // Refer to the appropriate map according to the stage difference between + // MI and the definition of an operand. 
+ + for (MachineOperand &UseMO : MI->uses()) { + if (!UseMO.isReg() || !UseMO.getReg().isVirtual()) + continue; + int DiffStage = 0; + Register OrigReg = UseMO.getReg(); + MachineInstr *DefInst = MRI.getVRegDef(OrigReg); + if (!DefInst || DefInst->getParent() != OrigKernel) + continue; + unsigned InitReg = 0; + unsigned DefReg = OrigReg; + if (DefInst->isPHI()) { + ++DiffStage; + unsigned LoopReg; + getPhiRegs(*DefInst, OrigKernel, InitReg, LoopReg); + // LoopReg is guaranteed to be defined within the loop by canApply() + DefReg = LoopReg; + DefInst = MRI.getVRegDef(LoopReg); + } + unsigned DefStageNum = Schedule.getStage(DefInst); + DiffStage += StageNum - DefStageNum; + Register NewReg; + if (PhaseNum >= DiffStage && CurVRMap[PhaseNum - DiffStage].count(DefReg)) + // NewReg is defined in a previous phase of the same block + NewReg = CurVRMap[PhaseNum - DiffStage][DefReg]; + else if (!PrevVRMap) + // Since this is the first iteration, refer the initial register of the + // loop + NewReg = InitReg; + else + // Cases where DiffStage is larger than PhaseNum. + // If MI is in the kernel block, the value is defined by the previous + // iteration and PhiVRMap is referenced. If MI is in the epilog block, the + // value is defined in the kernel block and KernelVRMap is referenced. + NewReg = (*PrevVRMap)[PrevVRMap->size() - (DiffStage - PhaseNum)][DefReg]; + + const TargetRegisterClass *NRC = + MRI.constrainRegClass(NewReg, MRI.getRegClass(OrigReg)); + if (NRC) + UseMO.setReg(NewReg); + else { + Register SplitReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg)); + BuildMI(*OrigKernel, MI, MI->getDebugLoc(), TII->get(TargetOpcode::COPY), + SplitReg) + .addReg(NewReg); + UseMO.setReg(SplitReg); + } + } +} + +/// Return a phi if Reg is referenced by the phi. +/// canApply() guarantees that at most only one such phi exists. +static MachineInstr *getLoopPhiUser(Register Reg, MachineBasicBlock *Loop) { + for (MachineInstr &Phi : Loop->phis()) { + unsigned InitVal, LoopVal; + getPhiRegs(Phi, Loop, InitVal, LoopVal); + if (LoopVal == Reg) + return Φ + } + return nullptr; +} + +/// Generate phis for registers defined by OrigMI. +void ModuloScheduleExpanderMVE::generatePhi( + MachineInstr *OrigMI, int UnrollNum, + SmallVectorImpl<ValueMapTy> &PrologVRMap, + SmallVectorImpl<ValueMapTy> &KernelVRMap, + SmallVectorImpl<ValueMapTy> &PhiVRMap) { + int StageNum = Schedule.getStage(OrigMI); + bool UsePrologReg; + if (Schedule.getNumStages() - NumUnroll + UnrollNum - 1 >= StageNum) + UsePrologReg = true; + else if (Schedule.getNumStages() - NumUnroll + UnrollNum == StageNum) + UsePrologReg = false; + else + return; + + // Examples that show which stages are merged by phi. 
+ // Meaning of the symbol following the stage number: + // a/b: Stages with the same letter are merged (UsePrologReg == true) + // +: Merged with the initial value (UsePrologReg == false) + // *: No phis required + // + // #Stages 3, #MVE 4 + // Iter 0 1 2 3 4 5 6 7 8 + // ----------------------------------------- + // Stage 0a Prolog#0 + // Stage 1a 0b Prolog#1 + // Stage 2* 1* 0* Kernel Unroll#0 + // Stage 2* 1* 0+ Kernel Unroll#1 + // Stage 2* 1+ 0a Kernel Unroll#2 + // Stage 2+ 1a 0b Kernel Unroll#3 + // + // #Stages 3, #MVE 2 + // Iter 0 1 2 3 4 5 6 7 8 + // ----------------------------------------- + // Stage 0a Prolog#0 + // Stage 1a 0b Prolog#1 + // Stage 2* 1+ 0a Kernel Unroll#0 + // Stage 2+ 1a 0b Kernel Unroll#1 + // + // #Stages 3, #MVE 1 + // Iter 0 1 2 3 4 5 6 7 8 + // ----------------------------------------- + // Stage 0* Prolog#0 + // Stage 1a 0b Prolog#1 + // Stage 2+ 1a 0b Kernel Unroll#0 + + for (MachineOperand &DefMO : OrigMI->defs()) { + if (!DefMO.isReg() || DefMO.isDead()) + continue; + Register OrigReg = DefMO.getReg(); + auto NewReg = KernelVRMap[UnrollNum].find(OrigReg); + if (NewReg == KernelVRMap[UnrollNum].end()) + continue; + Register CorrespondReg; + if (UsePrologReg) { + int PrologNum = Schedule.getNumStages() - NumUnroll + UnrollNum - 1; + CorrespondReg = PrologVRMap[PrologNum][OrigReg]; + } else { + MachineInstr *Phi = getLoopPhiUser(OrigReg, OrigKernel); + if (!Phi) + continue; + CorrespondReg = getInitPhiReg(*Phi, OrigKernel); + } + + assert(CorrespondReg.isValid()); + Register PhiReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg)); + BuildMI(*NewKernel, NewKernel->getFirstNonPHI(), DebugLoc(), + TII->get(TargetOpcode::PHI), PhiReg) + .addReg(NewReg->second) + .addMBB(NewKernel) + .addReg(CorrespondReg) + .addMBB(Prolog); + PhiVRMap[UnrollNum][OrigReg] = PhiReg; + } +} + +static void replacePhiSrc(MachineInstr &Phi, Register OrigReg, Register NewReg, + MachineBasicBlock *NewMBB) { + for (unsigned Idx = 1; Idx < Phi.getNumOperands(); Idx += 2) { + if (Phi.getOperand(Idx).getReg() == OrigReg) { + Phi.getOperand(Idx).setReg(NewReg); + Phi.getOperand(Idx + 1).setMBB(NewMBB); + return; + } + } +} + +/// Generate phis that merge values from multiple routes +void ModuloScheduleExpanderMVE::mergeRegUsesAfterPipeline(Register OrigReg, + Register NewReg) { + SmallVector<MachineOperand *> UsesAfterLoop; + SmallVector<MachineInstr *> LoopPhis; + for (MachineRegisterInfo::use_iterator I = MRI.use_begin(OrigReg), + E = MRI.use_end(); + I != E; ++I) { + MachineOperand &O = *I; + if (O.getParent()->getParent() != OrigKernel && + O.getParent()->getParent() != Prolog && + O.getParent()->getParent() != NewKernel && + O.getParent()->getParent() != Epilog) + UsesAfterLoop.push_back(&O); + if (O.getParent()->getParent() == OrigKernel && O.getParent()->isPHI()) + LoopPhis.push_back(O.getParent()); + } + + // Merge the route that only execute the pipelined loop (when there are no + // remaining iterations) with the route that execute the original loop. 
+ if (!UsesAfterLoop.empty()) { + Register PhiReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg)); + BuildMI(*NewExit, NewExit->getFirstNonPHI(), DebugLoc(), + TII->get(TargetOpcode::PHI), PhiReg) + .addReg(OrigReg) + .addMBB(OrigKernel) + .addReg(NewReg) + .addMBB(Epilog); + + for (MachineOperand *MO : UsesAfterLoop) + MO->setReg(PhiReg); + + if (!LIS.hasInterval(PhiReg)) + LIS.createEmptyInterval(PhiReg); + } + + // Merge routes from the pipelined loop and the bypassed route before the + // original loop + if (!LoopPhis.empty()) { + for (MachineInstr *Phi : LoopPhis) { + unsigned InitReg, LoopReg; + getPhiRegs(*Phi, OrigKernel, InitReg, LoopReg); + Register NewInit = MRI.createVirtualRegister(MRI.getRegClass(InitReg)); + BuildMI(*NewPreheader, NewPreheader->getFirstNonPHI(), Phi->getDebugLoc(), + TII->get(TargetOpcode::PHI), NewInit) + .addReg(InitReg) + .addMBB(Check) + .addReg(NewReg) + .addMBB(Epilog); + replacePhiSrc(*Phi, InitReg, NewInit, NewPreheader); + } + } +} + +void ModuloScheduleExpanderMVE::generateProlog( + SmallVectorImpl<ValueMapTy> &PrologVRMap) { + PrologVRMap.clear(); + PrologVRMap.resize(Schedule.getNumStages() - 1); + DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap; + for (int PrologNum = 0; PrologNum < Schedule.getNumStages() - 1; + ++PrologNum) { + for (MachineInstr *MI : Schedule.getInstructions()) { + if (MI->isPHI()) + continue; + int StageNum = Schedule.getStage(MI); + if (StageNum > PrologNum) + continue; + MachineInstr *NewMI = cloneInstr(MI); + updateInstrDef(NewMI, PrologVRMap[PrologNum], false); + NewMIMap[NewMI] = {PrologNum, StageNum}; + Prolog->push_back(NewMI); + } + } + + for (auto I : NewMIMap) { + MachineInstr *MI = I.first; + int PrologNum = I.second.first; + int StageNum = I.second.second; + updateInstrUse(MI, StageNum, PrologNum, PrologVRMap, nullptr); + } + + LLVM_DEBUG({ + dbgs() << "prolog:\n"; + Prolog->dump(); + }); +} + +void ModuloScheduleExpanderMVE::generateKernel( + SmallVectorImpl<ValueMapTy> &PrologVRMap, + SmallVectorImpl<ValueMapTy> &KernelVRMap, InstrMapTy &LastStage0Insts) { + KernelVRMap.clear(); + KernelVRMap.resize(NumUnroll); + SmallVector<ValueMapTy> PhiVRMap; + PhiVRMap.resize(NumUnroll); + DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap; + DenseMap<MachineInstr *, MachineInstr *> MIMapLastStage0; + for (int UnrollNum = 0; UnrollNum < NumUnroll; ++UnrollNum) { + for (MachineInstr *MI : Schedule.getInstructions()) { + if (MI->isPHI()) + continue; + int StageNum = Schedule.getStage(MI); + MachineInstr *NewMI = cloneInstr(MI); + if (UnrollNum == NumUnroll - 1) + LastStage0Insts[MI] = NewMI; + updateInstrDef(NewMI, KernelVRMap[UnrollNum], + (UnrollNum == NumUnroll - 1 && StageNum == 0)); + generatePhi(MI, UnrollNum, PrologVRMap, KernelVRMap, PhiVRMap); + NewMIMap[NewMI] = {UnrollNum, StageNum}; + NewKernel->push_back(NewMI); + } + } + + for (auto I : NewMIMap) { + MachineInstr *MI = I.first; + int UnrollNum = I.second.first; + int StageNum = I.second.second; + updateInstrUse(MI, StageNum, UnrollNum, KernelVRMap, &PhiVRMap); + } + + // If remaining trip count is greater than NumUnroll-1, loop continues + insertCondBranch(*NewKernel, NumUnroll - 1, LastStage0Insts, *NewKernel, + *Epilog); + + LLVM_DEBUG({ + dbgs() << "kernel:\n"; + NewKernel->dump(); + }); +} + +void ModuloScheduleExpanderMVE::generateEpilog( + SmallVectorImpl<ValueMapTy> &KernelVRMap, + SmallVectorImpl<ValueMapTy> &EpilogVRMap, InstrMapTy &LastStage0Insts) { + EpilogVRMap.clear(); + EpilogVRMap.resize(Schedule.getNumStages() - 1); + 
DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap; + for (int EpilogNum = 0; EpilogNum < Schedule.getNumStages() - 1; + ++EpilogNum) { + for (MachineInstr *MI : Schedule.getInstructions()) { + if (MI->isPHI()) + continue; + int StageNum = Schedule.getStage(MI); + if (StageNum <= EpilogNum) + continue; + MachineInstr *NewMI = cloneInstr(MI); + updateInstrDef(NewMI, EpilogVRMap[EpilogNum], StageNum - 1 == EpilogNum); + NewMIMap[NewMI] = {EpilogNum, StageNum}; + Epilog->push_back(NewMI); + } + } + + for (auto I : NewMIMap) { + MachineInstr *MI = I.first; + int EpilogNum = I.second.first; + int StageNum = I.second.second; + updateInstrUse(MI, StageNum, EpilogNum, EpilogVRMap, &KernelVRMap); + } + + // If there are remaining iterations, they are executed in the original loop. + // Instructions related to loop control, such as loop counter comparison, + // are indicated by shouldIgnoreForPipelining() and are assumed to be placed + // in stage 0. Thus, the map is for the last one in the kernel. + insertCondBranch(*Epilog, 0, LastStage0Insts, *NewPreheader, *NewExit); + + LLVM_DEBUG({ + dbgs() << "epilog:\n"; + Epilog->dump(); + }); +} + +/// Calculate the number of unroll required and set it to NumUnroll +void ModuloScheduleExpanderMVE::calcNumUnroll() { + DenseMap<MachineInstr *, unsigned> Inst2Idx; + NumUnroll = 1; + for (unsigned I = 0; I < Schedule.getInstructions().size(); ++I) + Inst2Idx[Schedule.getInstructions()[I]] = I; + + for (MachineInstr *MI : Schedule.getInstructions()) { + if (MI->isPHI()) + continue; + int StageNum = Schedule.getStage(MI); + for (const MachineOperand &MO : MI->uses()) { + if (!MO.isReg() || !MO.getReg().isVirtual()) + continue; + MachineInstr *DefMI = MRI.getVRegDef(MO.getReg()); + if (DefMI->getParent() != OrigKernel) + continue; + + int NumUnrollLocal = 1; + if (DefMI->isPHI()) { + ++NumUnrollLocal; + // canApply() guarantees that DefMI is not phi and is an instruction in + // the loop + DefMI = MRI.getVRegDef(getLoopPhiReg(*DefMI, OrigKernel)); + } + NumUnrollLocal += StageNum - Schedule.getStage(DefMI); + if (Inst2Idx[MI] <= Inst2Idx[DefMI]) + --NumUnrollLocal; + NumUnroll = std::max(NumUnroll, NumUnrollLocal); + } + } + LLVM_DEBUG(dbgs() << "NumUnroll: " << NumUnroll << "\n"); +} + +/// Create new virtual registers for definitions of NewMI and update NewMI. +/// If the definitions are referenced after the pipelined loop, phis are +/// created to merge with other routes. 
+void ModuloScheduleExpanderMVE::updateInstrDef(MachineInstr *NewMI, + ValueMapTy &VRMap, + bool LastDef) { + for (MachineOperand &MO : NewMI->operands()) { + if (!MO.isReg() || !MO.getReg().isVirtual() || !MO.isDef()) + continue; + Register Reg = MO.getReg(); + const TargetRegisterClass *RC = MRI.getRegClass(Reg); + Register NewReg = MRI.createVirtualRegister(RC); + MO.setReg(NewReg); + VRMap[Reg] = NewReg; + if (LastDef) + mergeRegUsesAfterPipeline(Reg, NewReg); + } +} + +void ModuloScheduleExpanderMVE::expand() { + OrigKernel = Schedule.getLoop()->getTopBlock(); + OrigPreheader = Schedule.getLoop()->getLoopPreheader(); + OrigExit = Schedule.getLoop()->getExitBlock(); + + LLVM_DEBUG(Schedule.dump()); + + generatePipelinedLoop(); +} + +/// Check if ModuloScheduleExpanderMVE can be applied to L +bool ModuloScheduleExpanderMVE::canApply(MachineLoop &L) { + if (!L.getExitBlock()) { + LLVM_DEBUG( + dbgs() << "Can not apply MVE expander: No single exit block.\n";); + return false; + } + + MachineBasicBlock *BB = L.getTopBlock(); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + + // Put some constraints on the operands of the phis to simplify the + // transformation + DenseSet<unsigned> UsedByPhi; + for (MachineInstr &MI : BB->phis()) { + // Registers defined by phis must be used only inside the loop and be never + // used by phis. + for (MachineOperand &MO : MI.defs()) + if (MO.isReg()) + for (MachineInstr &Ref : MRI.use_instructions(MO.getReg())) + if (Ref.getParent() != BB || Ref.isPHI()) { + LLVM_DEBUG(dbgs() + << "Can not apply MVE expander: A phi result is " + "referenced outside of the loop or by phi.\n";); + return false; + } + + // A source register from the loop block must be defined inside the loop. + // A register defined inside the loop must be referenced by only one phi at + // most. + unsigned InitVal, LoopVal; + getPhiRegs(MI, MI.getParent(), InitVal, LoopVal); + if (!Register(LoopVal).isVirtual() || + MRI.getVRegDef(LoopVal)->getParent() != BB) { + LLVM_DEBUG( + dbgs() << "Can not apply MVE expander: A phi source value coming " + "from the loop is not defined in the loop.\n";); + return false; + } + if (UsedByPhi.count(LoopVal)) { + LLVM_DEBUG(dbgs() << "Can not apply MVE expander: A value defined in the " + "loop is referenced by two or more phis.\n";); + return false; + } + UsedByPhi.insert(LoopVal); + } + + return true; +} + //===----------------------------------------------------------------------===// // ModuloScheduleTestPass implementation //===----------------------------------------------------------------------===// |