aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/CodeGen/MachineBlockPlacement.cpp
diff options
context:
space:
mode:
authorspupyrev <spupyrev@users.noreply.github.com>2024-10-02 10:48:08 -0700
committerGitHub <noreply@github.com>2024-10-02 10:48:08 -0700
commit9016f27c4253ac5c6282c30f0fe08ddd58522cdd (patch)
treeb3be761a89d327b5c1be64206acd1c9f3b4276ba /llvm/lib/CodeGen/MachineBlockPlacement.cpp
parent99f527d2807b5a14dc7ee64d15405f09e95ee9f2 (diff)
downloadllvm-9016f27c4253ac5c6282c30f0fe08ddd58522cdd.zip
llvm-9016f27c4253ac5c6282c30f0fe08ddd58522cdd.tar.gz
llvm-9016f27c4253ac5c6282c30f0fe08ddd58522cdd.tar.bz2
[CodeLayout] Size-aware machine block placement (#109711)
This is an implementation of a new "size-aware" machine block placement. The idea is to reorder blocks so that the number of fall-through jumps is maximized. Observe that profile data is ignored for the optimization, and it is applied only for instances with hasOptSize()=true. This strategy has two benefits: (i) it eliminates jump instructions, which results in smaller text size; (ii) we avoid using profile data while reordering blocks, which yields more "uniform" functions, thus helping ICF and machine outliner/merger. For large (mobile) apps, the size benefits of (i) and (ii) are roughly the same, combined providing up to 0.5% uncompressed and up to 1% compressed savings size on top of the current solution. The optimization is turned off by default.
Diffstat (limited to 'llvm/lib/CodeGen/MachineBlockPlacement.cpp')
-rw-r--r--llvm/lib/CodeGen/MachineBlockPlacement.cpp117
1 files changed, 84 insertions, 33 deletions
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 7807875..c42e632 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -218,6 +218,11 @@ static cl::opt<unsigned> ExtTspBlockPlacementMaxBlocks(
"block placement."),
cl::init(UINT_MAX), cl::Hidden);
+// Apply the ext-tsp algorithm minimizing the size of a binary.
+static cl::opt<bool>
+ ApplyExtTspForSize("apply-ext-tsp-for-size", cl::init(false), cl::Hidden,
+ cl::desc("Use ext-tsp for size-aware block placement."));
+
namespace llvm {
extern cl::opt<bool> EnableExtTspBlockPlacement;
extern cl::opt<bool> ApplyExtTspWithoutProfile;
@@ -595,7 +600,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
void precomputeTriangleChains();
/// Apply a post-processing step optimizing block placement.
- void applyExtTsp();
+ void applyExtTsp(bool OptForSize);
/// Modify the existing block placement in the function and adjust all jumps.
void assignBlockOrder(const std::vector<const MachineBasicBlock *> &NewOrder);
@@ -3505,20 +3510,36 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
// Initialize tail duplication thresholds.
initTailDupThreshold();
+ const bool OptForSize =
+ MF.getFunction().hasOptSize() ||
+ llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
+ // Determine whether to use ext-tsp for perf/size optimization. The method
+ // is beneficial only for instances with at least 3 basic blocks and it can be
+ // disabled for huge functions (exceeding a certain size).
+ bool UseExtTspForPerf = false;
+ bool UseExtTspForSize = false;
+ if (3 <= MF.size() && MF.size() <= ExtTspBlockPlacementMaxBlocks) {
+ UseExtTspForPerf =
+ EnableExtTspBlockPlacement &&
+ (ApplyExtTspWithoutProfile || MF.getFunction().hasProfileData());
+ UseExtTspForSize = OptForSize && ApplyExtTspForSize;
+ }
+
// Apply tail duplication.
if (allowTailDupPlacement()) {
MPDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
- bool OptForSize = MF.getFunction().hasOptSize() ||
- llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
if (OptForSize)
TailDupSize = 1;
const bool PreRegAlloc = false;
TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI.get(), PSI,
/* LayoutMode */ true, TailDupSize);
- precomputeTriangleChains();
+ if (!UseExtTspForSize)
+ precomputeTriangleChains();
}
- buildCFGChains();
+ // Run the main block placement.
+ if (!UseExtTspForSize)
+ buildCFGChains();
// Changing the layout can create new tail merging opportunities.
// TailMerge can create jump into if branches that make CFG irreducible for
@@ -3545,14 +3566,14 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
}
}
- // Apply a post-processing optimizing block placement.
- if (MF.size() >= 3 && EnableExtTspBlockPlacement &&
- (ApplyExtTspWithoutProfile || MF.getFunction().hasProfileData()) &&
- MF.size() <= ExtTspBlockPlacementMaxBlocks) {
- // Find a new placement and modify the layout of the blocks in the function.
- applyExtTsp();
-
- // Re-create CFG chain so that we can optimizeBranches and alignBlocks.
+ // Apply a post-processing optimizing block placement:
+ // - find a new placement and modify the layout of the blocks in the function;
+ // - re-create CFG chains so that we can optimizeBranches and alignBlocks.
+ if (UseExtTspForPerf || UseExtTspForSize) {
+ assert(
+ !(UseExtTspForPerf && UseExtTspForSize) &&
+ "UseExtTspForPerf and UseExtTspForSize can not be set simultaneosly");
+ applyExtTsp(/*OptForSize=*/UseExtTspForSize);
createCFGChainExtTsp();
}
@@ -3577,7 +3598,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
return true;
}
-void MachineBlockPlacement::applyExtTsp() {
+void MachineBlockPlacement::applyExtTsp(bool OptForSize) {
// Prepare data; blocks are indexed by their index in the current ordering.
DenseMap<const MachineBasicBlock *, uint64_t> BlockIndex;
BlockIndex.reserve(F->size());
@@ -3589,13 +3610,15 @@ void MachineBlockPlacement::applyExtTsp() {
CurrentBlockOrder.push_back(&MBB);
}
- auto BlockSizes = std::vector<uint64_t>(F->size());
- auto BlockCounts = std::vector<uint64_t>(F->size());
- std::vector<codelayout::EdgeCount> JumpCounts;
+ SmallVector<uint64_t, 0> BlockCounts(F->size());
+ SmallVector<uint64_t, 0> BlockSizes(F->size());
+ SmallVector<codelayout::EdgeCount, 0> JumpCounts;
+ SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
+ SmallVector<const MachineBasicBlock *, 4> Succs;
for (MachineBasicBlock &MBB : *F) {
// Getting the block frequency.
BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
- BlockCounts[BlockIndex[&MBB]] = BlockFreq.getFrequency();
+ BlockCounts[BlockIndex[&MBB]] = OptForSize ? 1 : BlockFreq.getFrequency();
// Getting the block size:
// - approximate the size of an instruction by 4 bytes, and
// - ignore debug instructions.
@@ -3604,23 +3627,49 @@ void MachineBlockPlacement::applyExtTsp() {
// not see a perf improvement with the exact block sizes.
auto NonDbgInsts =
instructionsWithoutDebug(MBB.instr_begin(), MBB.instr_end());
- int NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end());
+ size_t NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end());
BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts;
+
// Getting jump frequencies.
- for (MachineBasicBlock *Succ : MBB.successors()) {
- auto EP = MBPI->getEdgeProbability(&MBB, Succ);
- BlockFrequency JumpFreq = BlockFreq * EP;
- JumpCounts.push_back(
- {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()});
+ if (OptForSize) {
+ Cond.clear();
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
+ if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
+ continue;
+
+ const MachineBasicBlock *FTB = MBB.getFallThrough();
+ // Succs is a collection of distinct destinations of the block reachable
+ // from MBB via a jump instruction; initialize the list using the three
+ // (non-necessarily distinct) blocks, FTB, TBB, and FBB.
+ Succs.clear();
+ if (TBB && TBB != FTB)
+ Succs.push_back(TBB);
+ if (FBB && FBB != FTB)
+ Succs.push_back(FBB);
+ if (FTB)
+ Succs.push_back(FTB);
+ // Absolute magnitude of non-zero counts does not matter for the
+ // optimization; prioritize slightly jumps with a single successor, since
+ // the corresponding jump instruction will be removed from the binary.
+ const uint64_t Freq = Succs.size() == 1 ? 110 : 100;
+ for (const MachineBasicBlock *Succ : Succs)
+ JumpCounts.push_back({BlockIndex[&MBB], BlockIndex[Succ], Freq});
+ } else {
+ for (MachineBasicBlock *Succ : MBB.successors()) {
+ auto EP = MBPI->getEdgeProbability(&MBB, Succ);
+ BlockFrequency JumpFreq = BlockFreq * EP;
+ JumpCounts.push_back(
+ {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()});
+ }
}
}
LLVM_DEBUG(dbgs() << "Applying ext-tsp layout for |V| = " << F->size()
<< " with profile = " << F->getFunction().hasProfileData()
- << " (" << F->getName().str() << ")"
- << "\n");
- LLVM_DEBUG(dbgs() << format(" original layout score: %0.2f\n",
- calcExtTspScore(BlockSizes, JumpCounts)));
+ << " (" << F->getName() << ")" << "\n");
+
+ const double OrgScore = calcExtTspScore(BlockSizes, JumpCounts);
+ LLVM_DEBUG(dbgs() << format(" original layout score: %0.2f\n", OrgScore));
// Run the layout algorithm.
auto NewOrder = computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
@@ -3629,12 +3678,14 @@ void MachineBlockPlacement::applyExtTsp() {
for (uint64_t Node : NewOrder) {
NewBlockOrder.push_back(CurrentBlockOrder[Node]);
}
- LLVM_DEBUG(
- dbgs() << format(" optimized layout score: %0.2f\n",
- calcExtTspScore(NewOrder, BlockSizes, JumpCounts)));
+ const double OptScore = calcExtTspScore(NewOrder, BlockSizes, JumpCounts);
+ LLVM_DEBUG(dbgs() << format(" optimized layout score: %0.2f\n", OptScore));
- // Assign new block order.
- assignBlockOrder(NewBlockOrder);
+ // If the optimization is unsuccessful, fall back to the original block order.
+ if (OptForSize && OrgScore > OptScore)
+ assignBlockOrder(CurrentBlockOrder);
+ else
+ assignBlockOrder(NewBlockOrder);
}
void MachineBlockPlacement::assignBlockOrder(