author     Rafael Auler <rafaelauler@fb.com>  2021-10-08 11:47:10 -0700
committer  Maksim Panchenko <maks@fb.com>     2021-10-08 11:47:10 -0700
commit     a34c753fe709a624f5b087397fb05adeac2311e4 (patch)
tree       1c784a3d4ed1ad4ecaab64d448843f4346416d92 /bolt/lib/Passes/ReorderAlgorithm.cpp
parent     46bc197d72a63ded00d7fce2b891e4324b7bbd9c (diff)
Rebase: [NFC] Refactor sources to be buildable in shared mode
Summary: Moves source files into separate components and makes the dependencies between components explicit, so the LLVM build system knows how to build BOLT with BUILD_SHARED_LIBS=ON. Please use the -c merge.renamelimit=230 git option when rebasing your work on top of this change.

To achieve this, we create a new Core library to hold the core IR files (most classes whose names begin with Binary), a new Utils library, shared command line options used by both RewriteInstance and the core IR files, a new Rewrite library to hold most classes concerned with running the top-level functions that coordinate the binary rewriting process, and a new Profile library to hold classes dealing with profile reading and writing.

To remove the dependency of BinaryContext on X86-specific classes, we refactor the BinaryContext constructor to receive a reference to the specific backend directly from RewriteInstance. The dependency on X86- or AArch64-specific classes is thereby transferred to the Rewrite library. We can't have the Core library depend on targets because targets depend on Core (which would create a cycle).

Files implementing the entry point of a tool are moved to the tools/ folder. All header files are moved to the include/ folder. The src/ folder was renamed to lib/.

(cherry picked from FBD32746834)
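The dependency inversion described above can be sketched as follows; this is a minimal, hypothetical illustration of the idea, with names chosen for illustration rather than taken from BOLT's actual API:

  // Core defines only a target-agnostic interface and never names a target.
  class TargetBuilder {
  public:
    virtual ~TargetBuilder() = default;
    // Target-specific instruction handling lives behind this interface.
  };

  class BinaryContext {            // lives in the Core library
    TargetBuilder *Builder;        // injected by the caller; Core never names X86/AArch64
  public:
    explicit BinaryContext(TargetBuilder *Builder) : Builder(Builder) {}
  };

  // The Rewrite library, which is allowed to depend on targets, constructs the
  // concrete backend and hands it to Core, e.g.:
  //   auto Builder = std::make_unique<X86Builder>();   // or AArch64Builder
  //   BinaryContext BC(Builder.get());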
Diffstat (limited to 'bolt/lib/Passes/ReorderAlgorithm.cpp')
-rw-r--r--  bolt/lib/Passes/ReorderAlgorithm.cpp  739
1 file changed, 739 insertions, 0 deletions
diff --git a/bolt/lib/Passes/ReorderAlgorithm.cpp b/bolt/lib/Passes/ReorderAlgorithm.cpp
new file mode 100644
index 0000000..1718cae
--- /dev/null
+++ b/bolt/lib/Passes/ReorderAlgorithm.cpp
@@ -0,0 +1,739 @@
+//===--- Passes/ReorderAlgorithm.cpp - Basic block reordering algorithms --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements different basic block reordering algorithms.
+//
+//===----------------------------------------------------------------------===//
+
+#include "bolt/Passes/ReorderAlgorithm.h"
+#include "bolt/Core/BinaryBasicBlock.h"
+#include "bolt/Core/BinaryFunction.h"
+#include "llvm/Support/CommandLine.h"
+#include <queue>
+#include <random>
+#include <stack>
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "bolt"
+
+using namespace llvm;
+using namespace bolt;
+
+namespace opts {
+
+extern cl::OptionCategory BoltOptCategory;
+extern cl::opt<bool> NoThreads;
+
+static cl::opt<unsigned> ColdThreshold(
+    "cold-threshold",
+    cl::desc("threshold, in tenths of a percent of the main entry point's "
+             "execution frequency, below which a basic block is considered "
+             "cold (0 means a block is cold only if it has zero samples). "
+             "Default: 0"),
+    cl::init(0), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));
+
+static cl::opt<bool>
+PrintClusters("print-clusters",
+ cl::desc("print clusters"),
+ cl::ZeroOrMore,
+ cl::Hidden,
+ cl::cat(BoltOptCategory));
+
+cl::opt<uint32_t>
+RandomSeed("bolt-seed",
+ cl::desc("seed for randomization"),
+ cl::init(42),
+ cl::ZeroOrMore,
+ cl::Hidden,
+ cl::cat(BoltOptCategory));
+
+} // namespace opts
+
+namespace {
+
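+// Boost-style hash_combine: mix Val into Seed using the golden-ratio constant.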
+template <class T>
+inline void hashCombine(size_t &Seed, const T &Val) {
+ std::hash<T> Hasher;
+ Seed ^= Hasher(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2);
+}
+
+template <typename A, typename B>
+struct HashPair {
+ size_t operator()(const std::pair<A,B>& Val) const {
+ std::hash<A> Hasher;
+ size_t Seed = Hasher(Val.first);
+ hashCombine(Seed, Val.second);
+ return Seed;
+ }
+};
+
+} // anonymous namespace
+
+void ClusterAlgorithm::computeClusterAverageFrequency(const BinaryContext &BC) {
+ // Create a separate MCCodeEmitter to allow lock-free execution
+ BinaryContext::IndependentCodeEmitter Emitter;
+ if (!opts::NoThreads) {
+ Emitter = BC.createIndependentMCCodeEmitter();
+ }
+
+ AvgFreq.resize(Clusters.size(), 0.0);
+ for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
+ double Freq = 0.0;
+ uint64_t ClusterSize = 0;
+ for (BinaryBasicBlock *BB : Clusters[I]) {
+ if (BB->getNumNonPseudos() > 0) {
+ Freq += BB->getExecutionCount();
+ // Estimate the size of a block in bytes at run time
+ // NOTE: This might be inaccurate
+ ClusterSize += BB->estimateSize(Emitter.MCE.get());
+ }
+ }
+ AvgFreq[I] = ClusterSize == 0 ? 0 : Freq / ClusterSize;
+ }
+}
+
+void ClusterAlgorithm::printClusters() const {
+ for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
+ errs() << "Cluster number " << I;
+ if (AvgFreq.size() == Clusters.size())
+ errs() << " (frequency: " << AvgFreq[I] << ")";
+ errs() << " : ";
+ const char *Sep = "";
+ for (BinaryBasicBlock *BB : Clusters[I]) {
+ errs() << Sep << BB->getName();
+ Sep = ", ";
+ }
+ errs() << "\n";
+ }
+}
+
+void ClusterAlgorithm::reset() {
+ Clusters.clear();
+ ClusterEdges.clear();
+ AvgFreq.clear();
+}
+
+void GreedyClusterAlgorithm::EdgeTy::print(raw_ostream &OS) const {
+ OS << Src->getName() << " -> " << Dst->getName() << ", count: " << Count;
+}
+
+size_t GreedyClusterAlgorithm::EdgeHash::operator()(const EdgeTy &E) const {
+ HashPair<const BinaryBasicBlock *, const BinaryBasicBlock *> Hasher;
+ return Hasher(std::make_pair(E.Src, E.Dst));
+}
+
+bool GreedyClusterAlgorithm::EdgeEqual::operator()(
+ const EdgeTy &A, const EdgeTy &B) const {
+ return A.Src == B.Src && A.Dst == B.Dst;
+}
+
+void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF,
+ bool ComputeEdges) {
+ reset();
+
+ // Greedy heuristic implementation for the TSP, applied to BB layout. Try to
+ // maximize weight during a path traversing all BBs. In this way, we will
+ // convert the hottest branches into fall-throughs.
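+  // For example, if the hottest edge in the profile is A -> B (and B is not
+  // the entry block), A and B are merged into a single cluster first, so that
+  // branch becomes a fall-through in the final layout.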
+
+ // This is the queue of edges from which we will pop edges and use them to
+ // cluster basic blocks in a greedy fashion.
+ std::vector<EdgeTy> Queue;
+
+ // Initialize inter-cluster weights.
+ if (ComputeEdges)
+ ClusterEdges.resize(BF.layout_size());
+
+ // Initialize clusters and edge queue.
+ for (BinaryBasicBlock *BB : BF.layout()) {
+ // Create a cluster for this BB.
+ uint32_t I = Clusters.size();
+ Clusters.emplace_back();
+ std::vector<BinaryBasicBlock *> &Cluster = Clusters.back();
+ Cluster.push_back(BB);
+ BBToClusterMap[BB] = I;
+ // Populate priority queue with edges.
+ auto BI = BB->branch_info_begin();
+ for (BinaryBasicBlock *&I : BB->successors()) {
+ assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
+ "attempted reordering blocks of function with no profile data");
+ Queue.emplace_back(EdgeTy(BB, I, BI->Count));
+ ++BI;
+ }
+ }
+ // Sort and adjust the edge queue.
+ initQueue(Queue, BF);
+
+ // Grow clusters in a greedy fashion.
+ while (!Queue.empty()) {
+ EdgeTy E = Queue.back();
+ Queue.pop_back();
+
+ const BinaryBasicBlock *SrcBB = E.Src;
+ const BinaryBasicBlock *DstBB = E.Dst;
+
+ LLVM_DEBUG(dbgs() << "Popped edge "; E.print(dbgs()); dbgs() << "\n");
+
+    // Case 1: SrcBB and DstBB are the same, or DstBB is the entry block.
+    // Ignore this edge.
+    if (SrcBB == DstBB || DstBB == *BF.layout_begin()) {
+      LLVM_DEBUG(dbgs() << "\tIgnored (same src, dst or dst is entry)\n");
+ continue;
+ }
+
+ int I = BBToClusterMap[SrcBB];
+ int J = BBToClusterMap[DstBB];
+
+    // Case 2: SrcBB and DstBB are already allocated in the same cluster; just
+    // increase the intra-cluster edge count.
+ if (I == J) {
+ if (ComputeEdges)
+ ClusterEdges[I][I] += E.Count;
+ LLVM_DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n");
+ continue;
+ }
+
+ std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I];
+ std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J];
+ if (areClustersCompatible(ClusterA, ClusterB, E)) {
+ // Case 3: SrcBB is at the end of a cluster and DstBB is at the start,
+ // allowing us to merge two clusters.
+ for (BinaryBasicBlock *BB : ClusterB)
+ BBToClusterMap[BB] = I;
+ ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end());
+ ClusterB.clear();
+ if (ComputeEdges) {
+        // Increase the intra-cluster edge count of cluster A with the count of
+        // this edge as well as with the total count of previously visited edges
+        // from cluster B to cluster A.
+ ClusterEdges[I][I] += E.Count;
+ ClusterEdges[I][I] += ClusterEdges[J][I];
+ // Iterate through all inter-cluster edges and transfer edges targeting
+ // cluster B to cluster A.
+ for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K)
+ ClusterEdges[K][I] += ClusterEdges[K][J];
+ }
+ // Adjust the weights of the remaining edges and re-sort the queue.
+ adjustQueue(Queue, BF);
+ LLVM_DEBUG(dbgs() << "\tMerged clusters of src, dst\n");
+ } else {
+      // Case 4: SrcBB and DstBB are at positions that do not allow merging
+      // their clusters. Add the count of this edge to the inter-cluster edge
+      // count between clusters A and B to help us decide the ordering between
+      // these clusters later.
+ if (ComputeEdges)
+ ClusterEdges[I][J] += E.Count;
+ LLVM_DEBUG(
+ dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n");
+ }
+ }
+}
+
+void GreedyClusterAlgorithm::reset() {
+ ClusterAlgorithm::reset();
+ BBToClusterMap.clear();
+}
+
+void PHGreedyClusterAlgorithm::initQueue(
+ std::vector<EdgeTy> &Queue, const BinaryFunction &BF) {
+ // Define a comparison function to establish SWO between edges.
+ auto Comp = [&BF] (const EdgeTy &A, const EdgeTy &B) {
+    // With equal weights, prioritize branches with lower index
+    // source/destination. This helps to keep the original block order for
+    // blocks when the optimal order cannot be deduced from a profile.
+ if (A.Count == B.Count) {
+ const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
+ return (SrcOrder != 0)
+ ? SrcOrder > 0
+ : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
+ }
+ return A.Count < B.Count;
+ };
+
+ // Sort edges in increasing profile count order.
+ std::sort(Queue.begin(), Queue.end(), Comp);
+}
+
+void PHGreedyClusterAlgorithm::adjustQueue(
+ std::vector<EdgeTy> &Queue, const BinaryFunction &BF) {
+ // Nothing to do.
+ return;
+}
+
+bool PHGreedyClusterAlgorithm::areClustersCompatible(
+ const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const {
+ return Front.back() == E.Src && Back.front() == E.Dst;
+}
+
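+// The weight of edge E is its own count minus the counts of the other edges
+// leaving E's source (excluding self-loops and edges into the entry block) and
+// the other edges entering E's destination. Intuitively, a larger weight marks
+// an edge as a better candidate for becoming a fall-through.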
+int64_t MinBranchGreedyClusterAlgorithm::calculateWeight(
+ const EdgeTy &E, const BinaryFunction &BF) const {
+ const BinaryBasicBlock *SrcBB = E.Src;
+ const BinaryBasicBlock *DstBB = E.Dst;
+
+ // Initial weight value.
+ int64_t W = (int64_t)E.Count;
+
+ // Adjust the weight by taking into account other edges with the same source.
+ auto BI = SrcBB->branch_info_begin();
+ for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) {
+ assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
+ "attempted reordering blocks of function with no profile data");
+ assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
+ "overflow detected");
+ // Ignore edges with same source and destination, edges that target the
+ // entry block as well as the edge E itself.
+ if (SuccBB != SrcBB && SuccBB != *BF.layout_begin() && SuccBB != DstBB)
+ W -= (int64_t)BI->Count;
+ ++BI;
+ }
+
+ // Adjust the weight by taking into account other edges with the same
+ // destination.
+ for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) {
+ // Ignore edges with same source and destination as well as the edge E
+ // itself.
+ if (PredBB == DstBB || PredBB == SrcBB)
+ continue;
+ auto BI = PredBB->branch_info_begin();
+ for (const BinaryBasicBlock *SuccBB : PredBB->successors()) {
+ if (SuccBB == DstBB)
+ break;
+ ++BI;
+ }
+ assert(BI != PredBB->branch_info_end() && "invalid control flow graph");
+ assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
+ "attempted reordering blocks of function with no profile data");
+ assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
+ "overflow detected");
+ W -= (int64_t)BI->Count;
+ }
+
+ return W;
+}
+
+void MinBranchGreedyClusterAlgorithm::initQueue(
+ std::vector<EdgeTy> &Queue, const BinaryFunction &BF) {
+ // Initialize edge weights.
+ for (const EdgeTy &E : Queue)
+ Weight.emplace(std::make_pair(E, calculateWeight(E, BF)));
+
+ // Sort edges in increasing weight order.
+ adjustQueue(Queue, BF);
+}
+
+void MinBranchGreedyClusterAlgorithm::adjustQueue(
+ std::vector<EdgeTy> &Queue, const BinaryFunction &BF) {
+ // Define a comparison function to establish SWO between edges.
+ auto Comp = [&] (const EdgeTy &A, const EdgeTy &B) {
+ // With equal weights, prioritize branches with lower index
+ // source/destination. This helps to keep original block order for blocks
+ // when optimal order cannot be deduced from a profile.
+ if (Weight[A] == Weight[B]) {
+ const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
+ return (SrcOrder != 0)
+ ? SrcOrder > 0
+ : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
+ }
+ return Weight[A] < Weight[B];
+ };
+
+ // Iterate through all remaining edges to find edges that have their
+ // source and destination in the same cluster.
+ std::vector<EdgeTy> NewQueue;
+ for (const EdgeTy &E : Queue) {
+ const BinaryBasicBlock *SrcBB = E.Src;
+ const BinaryBasicBlock *DstBB = E.Dst;
+
+ // Case 1: SrcBB and DstBB are the same or DstBB is the entry block. Ignore
+ // this edge.
+ if (SrcBB == DstBB || DstBB == *BF.layout_begin()) {
+ LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs());
+ dbgs() << " (same src, dst)\n");
+ continue;
+ }
+
+ int I = BBToClusterMap[SrcBB];
+ int J = BBToClusterMap[DstBB];
+ std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I];
+ std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J];
+
+    // Case 2: SrcBB and DstBB are already in the same cluster or in
+    // incompatible clusters. Adjust the weights of edges with the same source
+    // or destination, so that this edge no longer affects them, and ignore
+    // this edge. Also increase the intra- (or inter-) cluster edge count.
+ if (I == J || !areClustersCompatible(ClusterA, ClusterB, E)) {
+ if (!ClusterEdges.empty())
+ ClusterEdges[I][J] += E.Count;
+ LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs());
+ dbgs() << " (src, dst belong to same cluster or incompatible "
+ "clusters)\n");
+ for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) {
+ if (SuccBB == DstBB)
+ continue;
+ auto WI = Weight.find(EdgeTy(SrcBB, SuccBB, 0));
+ assert(WI != Weight.end() && "CFG edge not found in Weight map");
+ WI->second += (int64_t)E.Count;
+ }
+ for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) {
+ if (PredBB == SrcBB)
+ continue;
+ auto WI = Weight.find(EdgeTy(PredBB, DstBB, 0));
+ assert(WI != Weight.end() && "CFG edge not found in Weight map");
+ WI->second += (int64_t)E.Count;
+ }
+ continue;
+ }
+
+ // Case 3: None of the previous cases is true, so just keep this edge in
+ // the queue.
+ NewQueue.emplace_back(E);
+ }
+
+ // Sort remaining edges in increasing weight order.
+ Queue.swap(NewQueue);
+ std::sort(Queue.begin(), Queue.end(), Comp);
+}
+
+bool MinBranchGreedyClusterAlgorithm::areClustersCompatible(
+ const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const {
+ return Front.back() == E.Src && Back.front() == E.Dst;
+}
+
+void MinBranchGreedyClusterAlgorithm::reset() {
+ GreedyClusterAlgorithm::reset();
+ Weight.clear();
+}
+
+void TSPReorderAlgorithm::reorderBasicBlocks(
+ const BinaryFunction &BF, BasicBlockOrder &Order) const {
+ std::vector<std::vector<uint64_t>> Weight;
+ std::vector<BinaryBasicBlock *> IndexToBB;
+
+ const size_t N = BF.layout_size();
+ assert(N <= std::numeric_limits<uint64_t>::digits &&
+ "cannot use TSP solution for sizes larger than bits in uint64_t");
+
+  // Populate the weight map and the index-to-block map.
+ for (BinaryBasicBlock *BB : BF.layout()) {
+ BB->setLayoutIndex(IndexToBB.size());
+ IndexToBB.push_back(BB);
+ }
+ Weight.resize(N);
+ for (BinaryBasicBlock *BB : BF.layout()) {
+ auto BI = BB->branch_info_begin();
+ Weight[BB->getLayoutIndex()].resize(N);
+ for (BinaryBasicBlock *SuccBB : BB->successors()) {
+ if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE)
+ Weight[BB->getLayoutIndex()][SuccBB->getLayoutIndex()] = BI->Count;
+ ++BI;
+ }
+ }
+
+ std::vector<std::vector<int64_t>> DP;
+ DP.resize(1 << N);
+ for (std::vector<int64_t> &Elmt : DP) {
+ Elmt.resize(N, -1);
+ }
+ // Start with the entry basic block being allocated with cost zero
+ DP[1][0] = 0;
+ // Walk through TSP solutions using a bitmask to represent state (current set
+ // of BBs in the layout)
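+  // DP[Set][Last] holds the maximum total edge weight over all layouts that
+  // place exactly the blocks in Set and end with block Last (a Held-Karp style
+  // dynamic program, hence the bit-width limit on N asserted above).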
+ uint64_t BestSet = 1;
+ uint64_t BestLast = 0;
+ int64_t BestWeight = 0;
+ for (uint64_t Set = 1; Set < (1ULL << N); ++Set) {
+ // Traverse each possibility of Last BB visited in this layout
+ for (uint64_t Last = 0; Last < N; ++Last) {
+ // Case 1: There is no possible layout with this BB as Last
+ if (DP[Set][Last] == -1)
+ continue;
+
+ // Case 2: There is a layout with this Set and this Last, and we try
+ // to expand this set with New
+ for (uint64_t New = 1; New < N; ++New) {
+ // Case 2a: BB "New" is already in this Set
+ if ((Set & (1ULL << New)) != 0)
+ continue;
+
+ // Case 2b: BB "New" is not in this set and we add it to this Set and
+ // record total weight of this layout with "New" as the last BB.
+ uint64_t NewSet = (Set | (1ULL << New));
+ if (DP[NewSet][New] == -1)
+ DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New];
+ DP[NewSet][New] = std::max(DP[NewSet][New],
+ DP[Set][Last] + (int64_t)Weight[Last][New]);
+
+ if (DP[NewSet][New] > BestWeight) {
+ BestWeight = DP[NewSet][New];
+ BestSet = NewSet;
+ BestLast = New;
+ }
+ }
+ }
+ }
+
+ // Define final function layout based on layout that maximizes weight
+ uint64_t Last = BestLast;
+ uint64_t Set = BestSet;
+ std::vector<bool> Visited;
+ Visited.resize(N);
+ Visited[Last] = true;
+ Order.push_back(IndexToBB[Last]);
+ Set = Set & ~(1ULL << Last);
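+  // Reconstruct the chosen path backwards: pick the block I in the remaining
+  // set that maximizes DP[Set][I] plus the weight of the edge I -> Last, then
+  // drop Last from the set; the order is reversed once the walk finishes.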
+ while (Set != 0) {
+ int64_t Best = -1;
+ uint64_t NewLast;
+ for (uint64_t I = 0; I < N; ++I) {
+ if (DP[Set][I] == -1)
+ continue;
+ int64_t AdjWeight = Weight[I][Last] > 0 ? Weight[I][Last] : 0;
+ if (DP[Set][I] + AdjWeight > Best) {
+ NewLast = I;
+ Best = DP[Set][I] + AdjWeight;
+ }
+ }
+ Last = NewLast;
+ Visited[Last] = true;
+ Order.push_back(IndexToBB[Last]);
+ Set = Set & ~(1ULL << Last);
+ }
+ std::reverse(Order.begin(), Order.end());
+
+  // Append, in their original layout order, the BBs that were not assigned a
+  // position by the TSP solution.
+  for (BinaryBasicBlock *BB : BF.layout()) {
+    if (!Visited[BB->getLayoutIndex()])
+ Order.push_back(BB);
+ }
+}
+
+void OptimizeReorderAlgorithm::reorderBasicBlocks(
+ const BinaryFunction &BF, BasicBlockOrder &Order) const {
+ if (BF.layout_empty())
+ return;
+
+ // Cluster basic blocks.
+ CAlgo->clusterBasicBlocks(BF);
+
+ if (opts::PrintClusters)
+ CAlgo->printClusters();
+
+ // Arrange basic blocks according to clusters.
+ for (ClusterAlgorithm::ClusterTy &Cluster : CAlgo->Clusters)
+ Order.insert(Order.end(), Cluster.begin(), Cluster.end());
+}
+
+void OptimizeBranchReorderAlgorithm::reorderBasicBlocks(
+ const BinaryFunction &BF, BasicBlockOrder &Order) const {
+ if (BF.layout_empty())
+ return;
+
+ // Cluster basic blocks.
+ CAlgo->clusterBasicBlocks(BF, /* ComputeEdges = */true);
+ std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
+ std::vector<std::unordered_map<uint32_t, uint64_t>> &ClusterEdges =
+ CAlgo->ClusterEdges;
+
+ // Compute clusters' average frequencies.
+ CAlgo->computeClusterAverageFrequency(BF.getBinaryContext());
+ std::vector<double> &AvgFreq = CAlgo->AvgFreq;
+
+ if (opts::PrintClusters)
+ CAlgo->printClusters();
+
+ // Cluster layout order
+ std::vector<uint32_t> ClusterOrder;
+
+ // Do a topological sort for clusters, prioritizing frequently-executed BBs
+ // during the traversal.
+ std::stack<uint32_t> Stack;
+ std::vector<uint32_t> Status;
+ std::vector<uint32_t> Parent;
+ Status.resize(Clusters.size(), 0);
+ Parent.resize(Clusters.size(), 0);
+ constexpr uint32_t STACKED = 1;
+ constexpr uint32_t VISITED = 2;
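+  // Iterative DFS from the entry cluster: successors reached by heavier edges
+  // are pushed onto the stack last and therefore visited first; a cluster is
+  // appended to ClusterOrder only after all its successors have been
+  // processed, and reversing the result below yields the topological order.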
+ Status[0] = STACKED;
+ Stack.push(0);
+ while (!Stack.empty()) {
+ uint32_t I = Stack.top();
+ if (!(Status[I] & VISITED)) {
+ Status[I] |= VISITED;
+ // Order successors by weight
+ auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) {
+ return ClusterEdges[I][A] > ClusterEdges[I][B];
+ };
+ std::priority_queue<uint32_t, std::vector<uint32_t>,
+ decltype(ClusterComp)> SuccQueue(ClusterComp);
+ for (std::pair<const uint32_t, uint64_t> &Target : ClusterEdges[I]) {
+ if (Target.second > 0 && !(Status[Target.first] & STACKED) &&
+ !Clusters[Target.first].empty()) {
+ Parent[Target.first] = I;
+ Status[Target.first] = STACKED;
+ SuccQueue.push(Target.first);
+ }
+ }
+ while (!SuccQueue.empty()) {
+ Stack.push(SuccQueue.top());
+ SuccQueue.pop();
+ }
+ continue;
+ }
+ // Already visited this node
+ Stack.pop();
+ ClusterOrder.push_back(I);
+ }
+ std::reverse(ClusterOrder.begin(), ClusterOrder.end());
+ // Put unreachable clusters at the end
+ for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
+ if (!(Status[I] & VISITED) && !Clusters[I].empty())
+ ClusterOrder.push_back(I);
+
+  // Sort clusters by decreasing average frequency, but never move a cluster
+  // ahead of one of its ancestors in the DFS tree.
+ auto Beg = ClusterOrder.begin();
+ // Don't reorder the first cluster, which contains the function entry point
+ ++Beg;
+ std::stable_sort(Beg, ClusterOrder.end(),
+ [&AvgFreq, &Parent](uint32_t A, uint32_t B) {
+ uint32_t P = Parent[A];
+ while (Parent[P] != 0) {
+ if (Parent[P] == B)
+ return false;
+ P = Parent[P];
+ }
+ P = Parent[B];
+ while (Parent[P] != 0) {
+ if (Parent[P] == A)
+ return true;
+ P = Parent[P];
+ }
+ return AvgFreq[A] > AvgFreq[B];
+ });
+
+ if (opts::PrintClusters) {
+ errs() << "New cluster order: ";
+ const char *Sep = "";
+ for (uint32_t O : ClusterOrder) {
+ errs() << Sep << O;
+ Sep = ", ";
+ }
+ errs() << '\n';
+ }
+
+ // Arrange basic blocks according to cluster order.
+ for (uint32_t ClusterIndex : ClusterOrder) {
+ ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
+ Order.insert(Order.end(), Cluster.begin(), Cluster.end());
+ }
+}
+
+void OptimizeCacheReorderAlgorithm::reorderBasicBlocks(
+ const BinaryFunction &BF, BasicBlockOrder &Order) const {
+ if (BF.layout_empty())
+ return;
+
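+  // opts::ColdThreshold is expressed in tenths of a percent of the entry
+  // block's execution count, hence the division by 1000.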
+ const uint64_t ColdThreshold =
+ opts::ColdThreshold * (*BF.layout_begin())->getExecutionCount() / 1000;
+
+ // Cluster basic blocks.
+ CAlgo->clusterBasicBlocks(BF);
+ std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
+
+ // Compute clusters' average frequencies.
+ CAlgo->computeClusterAverageFrequency(BF.getBinaryContext());
+ std::vector<double> &AvgFreq = CAlgo->AvgFreq;
+
+ if (opts::PrintClusters)
+ CAlgo->printClusters();
+
+ // Cluster layout order
+ std::vector<uint32_t> ClusterOrder;
+
+ // Order clusters based on average instruction execution frequency
+ for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
+ if (!Clusters[I].empty())
+ ClusterOrder.push_back(I);
+ // Don't reorder the first cluster, which contains the function entry point
+ std::stable_sort(std::next(ClusterOrder.begin()),
+ ClusterOrder.end(),
+ [&AvgFreq](uint32_t A, uint32_t B) {
+ return AvgFreq[A] > AvgFreq[B];
+ });
+
+ if (opts::PrintClusters) {
+ errs() << "New cluster order: ";
+ const char *Sep = "";
+ for (uint32_t O : ClusterOrder) {
+ errs() << Sep << O;
+ Sep = ", ";
+ }
+ errs() << '\n';
+ }
+
+ // Arrange basic blocks according to cluster order.
+ for (uint32_t ClusterIndex : ClusterOrder) {
+ ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
+ Order.insert(Order.end(), Cluster.begin(), Cluster.end());
+    // Force zero execution count on clusters that do not meet the cutoff
+    // specified by --cold-threshold.
+ if (AvgFreq[ClusterIndex] < static_cast<double>(ColdThreshold)) {
+ for (BinaryBasicBlock *BBPtr : Cluster) {
+ BBPtr->setExecutionCount(0);
+ }
+ }
+ }
+}
+
+void ReverseReorderAlgorithm::reorderBasicBlocks(
+ const BinaryFunction &BF, BasicBlockOrder &Order) const {
+ if (BF.layout_empty())
+ return;
+
+ BinaryBasicBlock *FirstBB = *BF.layout_begin();
+ Order.push_back(FirstBB);
+ for (auto RLI = BF.layout_rbegin(); *RLI != FirstBB; ++RLI)
+ Order.push_back(*RLI);
+}
+
+void RandomClusterReorderAlgorithm::reorderBasicBlocks(
+ const BinaryFunction &BF, BasicBlockOrder &Order) const {
+ if (BF.layout_empty())
+ return;
+
+ // Cluster basic blocks.
+ CAlgo->clusterBasicBlocks(BF);
+ std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
+
+ if (opts::PrintClusters)
+ CAlgo->printClusters();
+
+ // Cluster layout order
+ std::vector<uint32_t> ClusterOrder;
+
+  // Collect non-empty clusters.
+ for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
+ if (!Clusters[I].empty())
+ ClusterOrder.push_back(I);
+
+ std::shuffle(std::next(ClusterOrder.begin()), ClusterOrder.end(),
+ std::default_random_engine(opts::RandomSeed.getValue()));
+
+ if (opts::PrintClusters) {
+ errs() << "New cluster order: ";
+ const char *Sep = "";
+ for (uint32_t O : ClusterOrder) {
+ errs() << Sep << O;
+ Sep = ", ";
+ }
+ errs() << '\n';
+ }
+
+ // Arrange basic blocks according to cluster order.
+ for (uint32_t ClusterIndex : ClusterOrder) {
+ ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
+ Order.insert(Order.end(), Cluster.begin(), Cluster.end());
+ }
+}