aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/CodeGen/MachineBlockPlacement.cpp
diff options
context:
space:
mode:
authorspupyrev <spupyrev@fb.com>2021-11-08 07:02:33 -0800
committerspupyrev <spupyrev@fb.com>2021-12-07 07:31:10 -0800
commitf573f6866e1872789ac4ac41870045068167417c (patch)
treec4eaf7a8e5da19ebcda6d2bdd334987b8331088c /llvm/lib/CodeGen/MachineBlockPlacement.cpp
parent976a74d7d2dbd19670614f603caf490cca892fdc (diff)
downloadllvm-f573f6866e1872789ac4ac41870045068167417c.zip
llvm-f573f6866e1872789ac4ac41870045068167417c.tar.gz
llvm-f573f6866e1872789ac4ac41870045068167417c.tar.bz2
ext-tsp basic block layout
A new basic block ordering improving existing MachineBlockPlacement. The algorithm tries to find a layout of nodes (basic blocks) of a given CFG optimizing jump locality and thus processor I-cache utilization. This is achieved via increasing the number of fall-through jumps and co-locating frequently executed nodes together. The name follows the underlying optimization problem, Extended-TSP, which is a generalization of classical (maximum) Traveling Salesmen Problem. The algorithm is a greedy heuristic that works with chains (ordered lists) of basic blocks. Initially all chains are isolated basic blocks. On every iteration, we pick a pair of chains whose merging yields the biggest increase in the ExtTSP value, which models how i-cache "friendly" a specific chain is. A pair of chains giving the maximum gain is merged into a new chain. The procedure stops when there is only one chain left, or when merging does not increase ExtTSP. In the latter case, the remaining chains are sorted by density in decreasing order. An important aspect is the way two chains are merged. Unlike earlier algorithms (e.g., based on the approach of Pettis-Hansen), two chains, X and Y, are first split into three, X1, X2, and Y. Then we consider all possible ways of gluing the three chains (e.g., X1YX2, X1X2Y, X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score. This improves the quality of the final result (the search space is larger) while keeping the implementation sufficiently fast. Differential Revision: https://reviews.llvm.org/D113424
Diffstat (limited to 'llvm/lib/CodeGen/MachineBlockPlacement.cpp')
-rw-r--r--llvm/lib/CodeGen/MachineBlockPlacement.cpp161
1 files changed, 160 insertions, 1 deletions
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 8a1b403..692587c 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -61,6 +61,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/CodeLayout.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -193,6 +194,11 @@ static cl::opt<unsigned> TriangleChainCount(
cl::init(2),
cl::Hidden);
+static cl::opt<bool> EnableExtTspBlockPlacement(
+ "enable-ext-tsp-block-placement", cl::Hidden, cl::init(false),
+ cl::desc("Enable machine block placement based on the ext-tsp model, "
+ "optimizing I-cache utilization."));
+
namespace llvm {
extern cl::opt<unsigned> StaticLikelyProb;
extern cl::opt<unsigned> ProfileLikelyProb;
@@ -557,6 +563,15 @@ class MachineBlockPlacement : public MachineFunctionPass {
/// but a local analysis would not find them.
void precomputeTriangleChains();
+ /// Apply a post-processing step optimizing block placement.
+ void applyExtTsp();
+
+ /// Modify the existing block placement in the function and adjust all jumps.
+ void assignBlockOrder(const std::vector<const MachineBasicBlock *> &NewOrder);
+
+ /// Create a single CFG chain from the current block order.
+ void createCFGChainExtTsp();
+
public:
static char ID; // Pass identification, replacement for typeid
@@ -3387,6 +3402,15 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
}
}
+ // Apply a post-processing optimizing block placement.
+ if (MF.size() >= 3 && EnableExtTspBlockPlacement) {
+ // Find a new placement and modify the layout of the blocks in the function.
+ applyExtTsp();
+
+ // Re-create CFG chain so that we can optimizeBranches and alignBlocks.
+ createCFGChainExtTsp();
+ }
+
optimizeBranches();
alignBlocks();
@@ -3413,12 +3437,147 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
MBFI->view("MBP." + MF.getName(), false);
}
-
// We always return true as we have no way to track whether the final order
// differs from the original order.
return true;
}
+void MachineBlockPlacement::applyExtTsp() {
+ // Prepare data; blocks are indexed by their index in the current ordering.
+ DenseMap<const MachineBasicBlock *, uint64_t> BlockIndex;
+ BlockIndex.reserve(F->size());
+ std::vector<const MachineBasicBlock *> CurrentBlockOrder;
+ CurrentBlockOrder.reserve(F->size());
+ size_t NumBlocks = 0;
+ for (const MachineBasicBlock &MBB : *F) {
+ BlockIndex[&MBB] = NumBlocks++;
+ CurrentBlockOrder.push_back(&MBB);
+ }
+
+ auto BlockSizes = std::vector<uint64_t>(F->size());
+ auto BlockCounts = std::vector<uint64_t>(F->size());
+ DenseMap<std::pair<uint64_t, uint64_t>, uint64_t> JumpCounts;
+ for (MachineBasicBlock &MBB : *F) {
+ // Getting the block frequency.
+ BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
+ BlockCounts[BlockIndex[&MBB]] = BlockFreq.getFrequency();
+ // Getting the block size:
+ // - approximate the size of an instruction by 4 bytes, and
+ // - ignore debug instructions.
+ // Note: getting the exact size of each block is target-dependent and can be
+ // done by extending the interface of MCCodeEmitter. Experimentally we do
+ // not see a perf improvement with the exact block sizes.
+ auto NonDbgInsts =
+ instructionsWithoutDebug(MBB.instr_begin(), MBB.instr_end());
+ int NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end());
+ BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts;
+ // Getting jump frequencies.
+ for (MachineBasicBlock *Succ : MBB.successors()) {
+ auto EP = MBPI->getEdgeProbability(&MBB, Succ);
+ BlockFrequency EdgeFreq = BlockFreq * EP;
+ auto Edge = std::make_pair(BlockIndex[&MBB], BlockIndex[Succ]);
+ JumpCounts[Edge] = EdgeFreq.getFrequency();
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "Applying ext-tsp layout for |V| = " << F->size()
+ << " with profile = " << F->getFunction().hasProfileData()
+ << " (" << F->getName().str() << ")"
+ << "\n");
+ LLVM_DEBUG(
+ dbgs() << format(" original layout score: %0.2f\n",
+ calcExtTspScore(BlockSizes, BlockCounts, JumpCounts)));
+
+ // Run the layout algorithm.
+ auto NewOrder = applyExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
+ std::vector<const MachineBasicBlock *> NewBlockOrder;
+ NewBlockOrder.reserve(F->size());
+ for (uint64_t Node : NewOrder) {
+ NewBlockOrder.push_back(CurrentBlockOrder[Node]);
+ }
+ LLVM_DEBUG(dbgs() << format(" optimized layout score: %0.2f\n",
+ calcExtTspScore(NewOrder, BlockSizes, BlockCounts,
+ JumpCounts)));
+
+ // Assign new block order.
+ assignBlockOrder(NewBlockOrder);
+}
+
+void MachineBlockPlacement::assignBlockOrder(
+ const std::vector<const MachineBasicBlock *> &NewBlockOrder) {
+ assert(F->size() == NewBlockOrder.size() && "Incorrect size of block order");
+ F->RenumberBlocks();
+
+ bool HasChanges = false;
+ for (size_t I = 0; I < NewBlockOrder.size(); I++) {
+ if (NewBlockOrder[I] != F->getBlockNumbered(I)) {
+ HasChanges = true;
+ break;
+ }
+ }
+ // Stop early if the new block order is identical to the existing one.
+ if (!HasChanges)
+ return;
+
+ SmallVector<MachineBasicBlock *, 4> PrevFallThroughs(F->getNumBlockIDs());
+ for (auto &MBB : *F) {
+ PrevFallThroughs[MBB.getNumber()] = MBB.getFallThrough();
+ }
+
+ // Sort basic blocks in the function according to the computed order.
+ DenseMap<const MachineBasicBlock *, size_t> NewIndex;
+ for (const MachineBasicBlock *MBB : NewBlockOrder) {
+ NewIndex[MBB] = NewIndex.size();
+ }
+ F->sort([&](MachineBasicBlock &L, MachineBasicBlock &R) {
+ return NewIndex[&L] < NewIndex[&R];
+ });
+
+ // Update basic block branches by inserting explicit fallthrough branches
+ // when required and re-optimize branches when possible.
+ const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
+ SmallVector<MachineOperand, 4> Cond;
+ for (auto &MBB : *F) {
+ MachineFunction::iterator NextMBB = std::next(MBB.getIterator());
+ MachineFunction::iterator EndIt = MBB.getParent()->end();
+ auto *FTMBB = PrevFallThroughs[MBB.getNumber()];
+ // If this block had a fallthrough before we need an explicit unconditional
+ // branch to that block if the fallthrough block is not adjacent to the
+ // block in the new order.
+ if (FTMBB && (NextMBB == EndIt || &*NextMBB != FTMBB)) {
+ TII->insertUnconditionalBranch(MBB, FTMBB, MBB.findBranchDebugLoc());
+ }
+
+ // It might be possible to optimize branches by flipping the condition.
+ Cond.clear();
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
+ continue;
+ MBB.updateTerminator(FTMBB);
+ }
+
+#ifndef NDEBUG
+ // Make sure we correctly constructed all branches.
+ F->verify(this, "After optimized block reordering");
+#endif
+}
+
+void MachineBlockPlacement::createCFGChainExtTsp() {
+ BlockToChain.clear();
+ ComputedEdges.clear();
+ ChainAllocator.DestroyAll();
+
+ MachineBasicBlock *HeadBB = &F->front();
+ BlockChain *FunctionChain =
+ new (ChainAllocator.Allocate()) BlockChain(BlockToChain, HeadBB);
+
+ for (MachineBasicBlock &MBB : *F) {
+ if (HeadBB == &MBB)
+ continue; // Ignore head of the chain
+ FunctionChain->merge(&MBB, nullptr);
+ }
+}
+
namespace {
/// A pass to compute block placement statistics.