Diffstat (limited to 'llvm/lib/CodeGen/MachineBlockPlacement.cpp')
-rw-r--r-- | llvm/lib/CodeGen/MachineBlockPlacement.cpp | 161
1 file changed, 160 insertions, 1 deletion
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 8a1b403..692587c 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -61,6 +61,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/CodeLayout.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
@@ -193,6 +194,11 @@ static cl::opt<unsigned> TriangleChainCount(
     cl::init(2),
     cl::Hidden);
 
+static cl::opt<bool> EnableExtTspBlockPlacement(
+    "enable-ext-tsp-block-placement", cl::Hidden, cl::init(false),
+    cl::desc("Enable machine block placement based on the ext-tsp model, "
+             "optimizing I-cache utilization."));
+
 namespace llvm {
 extern cl::opt<unsigned> StaticLikelyProb;
 extern cl::opt<unsigned> ProfileLikelyProb;
@@ -557,6 +563,15 @@ class MachineBlockPlacement : public MachineFunctionPass {
   /// but a local analysis would not find them.
   void precomputeTriangleChains();
 
+  /// Apply a post-processing step optimizing block placement.
+  void applyExtTsp();
+
+  /// Modify the existing block placement in the function and adjust all jumps.
+  void assignBlockOrder(const std::vector<const MachineBasicBlock *> &NewOrder);
+
+  /// Create a single CFG chain from the current block order.
+  void createCFGChainExtTsp();
+
 public:
   static char ID; // Pass identification, replacement for typeid
 
@@ -3387,6 +3402,15 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
     }
   }
 
+  // Apply a post-processing step optimizing block placement.
+  if (MF.size() >= 3 && EnableExtTspBlockPlacement) {
+    // Find a new placement and modify the layout of the blocks in the function.
+    applyExtTsp();
+
+    // Re-create the CFG chain so that we can run optimizeBranches and alignBlocks.
+    createCFGChainExtTsp();
+  }
+
   optimizeBranches();
   alignBlocks();
 
@@ -3413,12 +3437,147 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
     MBFI->view("MBP." + MF.getName(), false);
   }
 
-
   // We always return true as we have no way to track whether the final order
   // differs from the original order.
   return true;
 }
 
+void MachineBlockPlacement::applyExtTsp() {
+  // Prepare data; blocks are indexed by their index in the current ordering.
+  DenseMap<const MachineBasicBlock *, uint64_t> BlockIndex;
+  BlockIndex.reserve(F->size());
+  std::vector<const MachineBasicBlock *> CurrentBlockOrder;
+  CurrentBlockOrder.reserve(F->size());
+  size_t NumBlocks = 0;
+  for (const MachineBasicBlock &MBB : *F) {
+    BlockIndex[&MBB] = NumBlocks++;
+    CurrentBlockOrder.push_back(&MBB);
+  }
+
+  auto BlockSizes = std::vector<uint64_t>(F->size());
+  auto BlockCounts = std::vector<uint64_t>(F->size());
+  DenseMap<std::pair<uint64_t, uint64_t>, uint64_t> JumpCounts;
+  for (MachineBasicBlock &MBB : *F) {
+    // Getting the block frequency.
+    BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
+    BlockCounts[BlockIndex[&MBB]] = BlockFreq.getFrequency();
+    // Getting the block size:
+    // - approximate the size of an instruction by 4 bytes, and
+    // - ignore debug instructions.
+    // Note: getting the exact size of each block is target-dependent and can
+    // be done by extending the interface of MCCodeEmitter. Experimentally we
+    // do not see a perf improvement with the exact block sizes.
+    auto NonDbgInsts =
+        instructionsWithoutDebug(MBB.instr_begin(), MBB.instr_end());
+    int NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end());
+    BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts;
+    // Getting jump frequencies.
+    for (MachineBasicBlock *Succ : MBB.successors()) {
+      auto EP = MBPI->getEdgeProbability(&MBB, Succ);
+      BlockFrequency EdgeFreq = BlockFreq * EP;
+      auto Edge = std::make_pair(BlockIndex[&MBB], BlockIndex[Succ]);
+      JumpCounts[Edge] = EdgeFreq.getFrequency();
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "Applying ext-tsp layout for |V| = " << F->size()
+                    << " with profile = " << F->getFunction().hasProfileData()
+                    << " (" << F->getName().str() << ")"
+                    << "\n");
+  LLVM_DEBUG(
+      dbgs() << format("  original  layout score: %0.2f\n",
+                       calcExtTspScore(BlockSizes, BlockCounts, JumpCounts)));
+
+  // Run the layout algorithm.
+  auto NewOrder = applyExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
+  std::vector<const MachineBasicBlock *> NewBlockOrder;
+  NewBlockOrder.reserve(F->size());
+  for (uint64_t Node : NewOrder) {
+    NewBlockOrder.push_back(CurrentBlockOrder[Node]);
+  }
+  LLVM_DEBUG(dbgs() << format("  optimized layout score: %0.2f\n",
+                              calcExtTspScore(NewOrder, BlockSizes, BlockCounts,
+                                              JumpCounts)));
+
+  // Assign new block order.
+  assignBlockOrder(NewBlockOrder);
+}
+
+void MachineBlockPlacement::assignBlockOrder(
+    const std::vector<const MachineBasicBlock *> &NewBlockOrder) {
+  assert(F->size() == NewBlockOrder.size() && "Incorrect size of block order");
+  F->RenumberBlocks();
+
+  bool HasChanges = false;
+  for (size_t I = 0; I < NewBlockOrder.size(); I++) {
+    if (NewBlockOrder[I] != F->getBlockNumbered(I)) {
+      HasChanges = true;
+      break;
+    }
+  }
+  // Stop early if the new block order is identical to the existing one.
+  if (!HasChanges)
+    return;
+
+  SmallVector<MachineBasicBlock *, 4> PrevFallThroughs(F->getNumBlockIDs());
+  for (auto &MBB : *F) {
+    PrevFallThroughs[MBB.getNumber()] = MBB.getFallThrough();
+  }
+
+  // Sort basic blocks in the function according to the computed order.
+  DenseMap<const MachineBasicBlock *, size_t> NewIndex;
+  for (const MachineBasicBlock *MBB : NewBlockOrder) {
+    NewIndex[MBB] = NewIndex.size();
+  }
+  F->sort([&](MachineBasicBlock &L, MachineBasicBlock &R) {
+    return NewIndex[&L] < NewIndex[&R];
+  });
+
+  // Update basic block branches by inserting explicit fallthrough branches
+  // when required and re-optimize branches when possible.
+  const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
+  SmallVector<MachineOperand, 4> Cond;
+  for (auto &MBB : *F) {
+    MachineFunction::iterator NextMBB = std::next(MBB.getIterator());
+    MachineFunction::iterator EndIt = MBB.getParent()->end();
+    auto *FTMBB = PrevFallThroughs[MBB.getNumber()];
+    // If this block had a fallthrough before, we need an explicit unconditional
+    // branch to that block if the fallthrough block is not adjacent to the
+    // block in the new order.
+    if (FTMBB && (NextMBB == EndIt || &*NextMBB != FTMBB)) {
+      TII->insertUnconditionalBranch(MBB, FTMBB, MBB.findBranchDebugLoc());
+    }
+
+    // It might be possible to optimize branches by flipping the condition.
+    Cond.clear();
+    MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+    if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
+      continue;
+    MBB.updateTerminator(FTMBB);
+  }
+
+#ifndef NDEBUG
+  // Make sure we correctly constructed all branches.
+  F->verify(this, "After optimized block reordering");
+#endif
+}
+
+void MachineBlockPlacement::createCFGChainExtTsp() {
+  BlockToChain.clear();
+  ComputedEdges.clear();
+  ChainAllocator.DestroyAll();
+
+  MachineBasicBlock *HeadBB = &F->front();
+  BlockChain *FunctionChain =
+      new (ChainAllocator.Allocate()) BlockChain(BlockToChain, HeadBB);
+
+  for (MachineBasicBlock &MBB : *F) {
+    if (HeadBB == &MBB)
+      continue; // Ignore the head of the chain.
+    FunctionChain->merge(&MBB, nullptr);
+  }
+}
+
 namespace {
 
 /// A pass to compute block placement statistics.
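
The new code path is opt-in: it runs only when the hidden flag -enable-ext-tsp-block-placement is set (it defaults to false) and the function has at least three basic blocks (MF.size() >= 3), since smaller functions leave nothing to reorder. Block and jump counts are taken from MachineBlockFrequencyInfo and MachineBranchProbabilityInfo, so the optimization is most effective on profile-annotated builds; without profile data the counts come from static branch-probability heuristics. To experiment with it, pass the flag to the code generator, e.g. llc -O2 -enable-ext-tsp-block-placement input.ll.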
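
For readers unfamiliar with the CodeLayout interface consumed above: applyExtTspLayout takes per-block sizes in bytes, per-block execution counts, and a map from (source, destination) block-index pairs to jump counts, and returns a permutation of block indices. The sketch below is not part of the patch; it encodes a small hypothetical diamond-shaped CFG the same way applyExtTsp does, assuming the CodeLayout.h entry points live in namespace llvm and match the call sites in the diff.

#include "llvm/ADT/DenseMap.h"
#include "llvm/Transforms/Utils/CodeLayout.h"
#include <cstdint>
#include <utility>
#include <vector>

// Hypothetical diamond CFG: 0 = entry, 1 = hot side, 2 = cold side, 3 = exit.
std::vector<uint64_t> layoutDiamondExample() {
  // Sizes in bytes (4 bytes per instruction, as approximated by the pass).
  std::vector<uint64_t> BlockSizes{16, 40, 40, 8};
  // Execution counts, e.g. taken from a profile.
  std::vector<uint64_t> BlockCounts{100, 90, 10, 100};
  // Jump counts keyed by (source index, destination index).
  llvm::DenseMap<std::pair<uint64_t, uint64_t>, uint64_t> JumpCounts;
  JumpCounts[{0, 1}] = 90; // entry -> hot
  JumpCounts[{0, 2}] = 10; // entry -> cold
  JumpCounts[{1, 3}] = 90; // hot -> exit
  JumpCounts[{2, 3}] = 10; // cold -> exit
  // One would expect the hot path to be chained as fallthroughs and the cold
  // block pushed out of the way, i.e. an order like {0, 1, 3, 2}.
  return llvm::applyExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
}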
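
The "layout score" in the debug output is the ext-tsp objective itself: every jump earns credit that is maximal when the jump is a direct fallthrough and decays with the byte distance between the end of the source block and the start of the target, a proxy for I-cache utilization. The following self-contained sketch shows a simplified form of that objective; the weights and distance windows are illustrative assumptions, not the exact constants behind calcExtTspScore.

#include <cstdint>
#include <map>
#include <utility>
#include <vector>

// Simplified ext-tsp objective: score a layout (a permutation of block ids)
// given block sizes in bytes and per-edge jump counts. The constants are
// illustrative assumptions, not the exact values used by CodeLayout.
double extTspScoreSketch(
    const std::vector<uint64_t> &Order,
    const std::vector<uint64_t> &Sizes,
    const std::map<std::pair<uint64_t, uint64_t>, uint64_t> &Jumps) {
  // Compute the byte address of every block under the given order.
  std::vector<uint64_t> Addr(Sizes.size());
  uint64_t CurAddr = 0;
  for (uint64_t Node : Order) {
    Addr[Node] = CurAddr;
    CurAddr += Sizes[Node];
  }
  const double FallthroughWeight = 1.0, JumpWeight = 0.1;
  const double ForwardWindow = 1024.0, BackwardWindow = 640.0;
  double Score = 0.0;
  for (const auto &[Edge, Count] : Jumps) {
    uint64_t SrcEnd = Addr[Edge.first] + Sizes[Edge.first];
    uint64_t DstBegin = Addr[Edge.second];
    if (SrcEnd == DstBegin) {
      // Direct fallthrough: full credit.
      Score += FallthroughWeight * Count;
    } else if (SrcEnd < DstBegin && DstBegin - SrcEnd < ForwardWindow) {
      // Short forward jump: credit decays linearly with distance.
      double Dist = DstBegin - SrcEnd;
      Score += JumpWeight * Count * (1.0 - Dist / ForwardWindow);
    } else if (DstBegin < SrcEnd && SrcEnd - DstBegin < BackwardWindow) {
      // Short backward jump: decayed credit over a smaller window.
      double Dist = SrcEnd - DstBegin;
      Score += JumpWeight * Count * (1.0 - Dist / BackwardWindow);
    }
    // Distant jumps earn nothing.
  }
  return Score;
}

With only the fallthrough term, the objective reduces to classic fallthrough maximization, which is a maximum-TSP-path formulation (hence the name); the decayed forward and backward terms additionally reward keeping hot jump targets close even when a fallthrough is impossible.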