author     Amir Ayupov <aaupov@fb.com>    2025-07-11 21:06:42 -0700
committer  Amir Ayupov <aaupov@fb.com>    2025-07-11 21:06:42 -0700
commit     ab054236f362a3aadf179a571a217c51ece4dd50 (patch)
tree       1669b562ad010889e3e4c74d9be364a3047eb430
parent     0da9aacf4898fc9debfd930ab3dfbac7084c5e2a (diff)
parent     31279c10f03fd955480a4ec195433d5c676e6593 (diff)
[spr] initial version (users/aaupov/spr/bolt-invoke-terminator)
Created using spr 1.3.4
-rw-r--r--  bolt/include/bolt/Core/MCPlusBuilder.h     | 11
-rw-r--r--  bolt/include/bolt/Profile/DataAggregator.h | 43
-rw-r--r--  bolt/lib/Core/MCPlusBuilder.cpp            |  5
-rw-r--r--  bolt/lib/Profile/DataAggregator.cpp        | 95
4 files changed, 135 insertions, 19 deletions
diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index b99cf8b..f902a8c 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -430,6 +430,17 @@ public:
     return Analysis->isIndirectBranch(Inst);
   }
 
+  /// Returns true if the instruction unconditionally transfers the control to
+  /// another program point, interrupting sequential code execution, e.g. by a
+  /// call, return, or unconditional jump. This explicitly leaves out
+  /// conditional branches as they may not be taken, but does allow transferring
+  /// the control to the next instruction (zero-displacement jump/call).
+  bool isUnconditionalControlTransfer(const MCInst &Inst) const {
+    const MCInstrDesc &Desc = Info->get(Inst.getOpcode());
+    // barrier captures returns and unconditional branches
+    return Desc.isBarrier() || Desc.isCall();
+  }
+
   /// Returns true if the instruction is memory indirect call or jump
   virtual bool isBranchOnMem(const MCInst &Inst) const {
     llvm_unreachable("not implemented");

diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h
index c7400b0..db0f690 100644
--- a/bolt/include/bolt/Profile/DataAggregator.h
+++ b/bolt/include/bolt/Profile/DataAggregator.h
@@ -137,7 +137,7 @@ private:
   std::vector<std::pair<Trace, TakenBranchInfo>> Traces;
   /// Pre-populated addresses of returns, coming from pre-aggregated data or
   /// disassembly. Used to disambiguate call-continuation fall-throughs.
-  std::unordered_set<uint64_t> Returns;
+  std::unordered_map<uint64_t, bool> Returns;
   std::unordered_map<uint64_t, uint64_t> BasicSamples;
   std::vector<PerfMemSample> MemSamples;
 
@@ -498,6 +498,10 @@ private:
   /// If \p FileBuildID has no match, then issue an error and exit.
   void processFileBuildID(StringRef FileBuildID);
 
+  /// Infer missing fall-throughs for branch-only traces (LBR top-of-stack
+  /// entries).
+  void imputeFallThroughs();
+
   /// Debugging dump methods
   void dump() const;
   void dump(const PerfBranchSample &Sample) const;
@@ -509,6 +513,43 @@ private:
   void printBasicSamplesDiagnostics(uint64_t OutOfRangeSamples) const;
   void printBranchStacksDiagnostics(uint64_t IgnoredSamples) const;
 
+  /// Get instruction at \p Addr either from containing binary function or
+  /// disassemble in-place, and invoke \p Callback on resulting MCInst.
+  /// Returns the result of the callback or nullopt.
+  template <typename T>
+  std::optional<T>
+  testInstructionAt(const uint64_t Addr,
+                    std::function<T(const MCInst &)> Callback) const {
+    BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr);
+    if (!Func)
+      return std::nullopt;
+    const uint64_t Offset = Addr - Func->getAddress();
+    if (Func->hasInstructions()) {
+      if (auto *MI = Func->getInstructionAtOffset(Offset))
+        return Callback(*MI);
+    } else {
+      if (auto MI = Func->disassembleInstructionAtOffset(Offset))
+        return Callback(*MI);
+    }
+    return std::nullopt;
+  }
+
+  /// Apply \p Callback to the instruction at \p Addr, and memoize the result
+  /// in a \p Map.
+  template <typename T>
+  std::optional<T> testAndSet(const uint64_t Addr,
+                              std::function<T(const MCInst &)> Callback,
+                              std::unordered_map<uint64_t, T> &Map) {
+    auto It = Map.find(Addr);
+    if (It != Map.end())
+      return It->second;
+    if (std::optional<T> Res = testInstructionAt<T>(Addr, Callback)) {
+      Map.emplace(Addr, *Res);
+      return *Res;
+    }
+    return std::nullopt;
+  }
+
 public:
   /// If perf.data was collected without build ids, the buildid-list may contain
   /// incomplete entries. Return true if the buffer containing
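The header changes above introduce a classifier (isUnconditionalControlTransfer) and a memoize-or-compute helper (testAndSet) that caches per-address predicate results so each address is disassembled at most once. Below is a minimal standalone sketch of that caching pattern using only standard-library types; lookupInstruction and the toy IsReturn predicate are illustrative stand-ins for testInstructionAt and the real MCInst callbacks, not BOLT API.

    #include <cstdint>
    #include <functional>
    #include <iostream>
    #include <optional>
    #include <unordered_map>

    // Stand-in for BOLT's "find containing function, then read or disassemble
    // the instruction" step (testInstructionAt); purely illustrative.
    // Address 0 models "no containing function": no answer at all.
    std::optional<bool> lookupInstruction(uint64_t Addr,
                                          const std::function<bool(uint64_t)> &P) {
      if (Addr == 0)
        return std::nullopt;
      return P(Addr);
    }

    // Sketch of the testAndSet pattern: answer from the cache when possible,
    // fall back to the expensive lookup on a miss, and memoize the result.
    std::optional<bool> testAndSet(uint64_t Addr,
                                   const std::function<bool(uint64_t)> &P,
                                   std::unordered_map<uint64_t, bool> &Map) {
      auto It = Map.find(Addr);
      if (It != Map.end()) // memoized hit: no disassembly needed
        return It->second;
      if (std::optional<bool> Res = lookupInstruction(Addr, P)) {
        Map.emplace(Addr, *Res);
        return *Res;
      }
      return std::nullopt; // unknown addresses stay uncached
    }

    int main() {
      std::unordered_map<uint64_t, bool> Returns;
      auto IsReturn = [](uint64_t Addr) { return Addr == 0x400; }; // toy predicate
      // First call computes and caches; the repeat is a pure map lookup.
      std::cout << testAndSet(0x400, IsReturn, Returns).value_or(false) << '\n'; // 1
      std::cout << testAndSet(0x400, IsReturn, Returns).value_or(false) << '\n'; // 1
      std::cout << testAndSet(0, IsReturn, Returns).value_or(false) << '\n';     // 0
    }

Keeping the cache as an explicit map parameter is what lets the same helper back both the Returns map and the per-call IsUncondCTMap in the aggregator below.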
diff --git a/bolt/lib/Core/MCPlusBuilder.cpp b/bolt/lib/Core/MCPlusBuilder.cpp
index fa8f4d1..3c4e2fb 100644
--- a/bolt/lib/Core/MCPlusBuilder.cpp
+++ b/bolt/lib/Core/MCPlusBuilder.cpp
@@ -34,6 +34,10 @@ cl::opt<bool>
     TerminalTrap("terminal-trap",
                  cl::desc("Assume that execution stops at trap instruction"),
                  cl::init(true), cl::Hidden, cl::cat(BoltCategory));
+cl::opt<bool> InvokeTerminator("invoke-terminator",
+                               cl::desc("Invoke is a block terminator"),
+                               cl::init(false), cl::Hidden,
+                               cl::cat(BoltCategory));
 }
 
 bool MCPlusBuilder::equals(const MCInst &A, const MCInst &B,
@@ -133,6 +137,7 @@ bool MCPlusBuilder::equals(const MCSpecifierExpr &A, const MCSpecifierExpr &B,
 
 bool MCPlusBuilder::isTerminator(const MCInst &Inst) const {
   return Analysis->isTerminator(Inst) ||
+         (opts::InvokeTerminator && isInvoke(Inst)) ||
          (opts::TerminalTrap && Info->get(Inst.getOpcode()).isTrap());
 }
 
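The isTerminator change composes flag-gated predicates: an instruction ends a basic block if the target analysis says so, or if it is an invoke under -invoke-terminator, or a trap under -terminal-trap. A toy mirror of that composition, with plain booleans standing in for the MCInst queries (illustrative only, not BOLT code):

    #include <iostream>

    // The bools stand in for Analysis->isTerminator(Inst), isInvoke(Inst),
    // and Desc.isTrap(); Opts mimics the two hidden cl::opt flags.
    struct Opts {
      bool InvokeTerminator = false; // patch default: off
      bool TerminalTrap = true;      // patch default: on
    };

    bool isTerminator(bool AnalysisTerminator, bool IsInvoke, bool IsTrap,
                      const Opts &O) {
      return AnalysisTerminator || (O.InvokeTerminator && IsInvoke) ||
             (O.TerminalTrap && IsTrap);
    }

    int main() {
      Opts Default;
      Opts WithInvoke{/*InvokeTerminator=*/true, /*TerminalTrap=*/true};
      // An invoke terminates a block only when -invoke-terminator is set.
      std::cout << isTerminator(false, true, false, Default) << '\n';    // 0
      std::cout << isTerminator(false, true, false, WithInvoke) << '\n'; // 1
    }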
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index c60591c..905728de 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -77,6 +77,11 @@ FilterPID("pid",
   cl::Optional,
   cl::cat(AggregatorCategory));
 
+static cl::opt<bool> ImputeTraceFallthrough(
+    "impute-trace-fall-through",
+    cl::desc("impute missing fall-throughs for branch-only traces"),
+    cl::Optional, cl::cat(AggregatorCategory));
+
 static cl::opt<bool>
 IgnoreBuildID("ignore-build-id",
   cl::desc("continue even if build-ids in input binary and perf.data mismatch"),
@@ -513,6 +518,69 @@ void DataAggregator::parsePerfData(BinaryContext &BC) {
   deleteTempFiles();
 }
 
+void DataAggregator::imputeFallThroughs() {
+  if (Traces.empty())
+    return;
+
+  std::pair PrevBranch(Trace::EXTERNAL, Trace::EXTERNAL);
+  uint64_t AggregateCount = 0;
+  uint64_t AggregateFallthroughSize = 0;
+  uint64_t InferredTraces = 0;
+
+  // Helper map with whether the instruction is a call/ret/unconditional branch
+  std::unordered_map<uint64_t, bool> IsUncondCTMap;
+  auto checkUnconditionalControlTransfer = [&](const uint64_t Addr) {
+    auto isUncondCT = [&](const MCInst &MI) -> bool {
+      return BC->MIB->isUnconditionalControlTransfer(MI);
+    };
+    return testAndSet<bool>(Addr, isUncondCT, IsUncondCTMap).value_or(true);
+  };
+
+  // Traces are sorted by their component addresses (Branch, From, To).
+  // assert(is_sorted(Traces));
+
+  // Traces corresponding to the top-of-stack branch entry with a missing
+  // fall-through have BR_ONLY (-1ULL/UINT64_MAX) in To field, meaning that
+  // for fixed values of Branch and From, branch-only traces are stored after
+  // all traces with valid fall-through.
+  //
+  // Group traces by (Branch, From) and compute weighted average fall-through
+  // length for the top-of-stack trace (closing the group) by accumulating the
+  // fall-through lengths of traces with valid fall-throughs earlier in the
+  // group.
+  for (auto &[Trace, Info] : Traces) {
+    // Skip fall-throughs in external code.
+    if (Trace.From == Trace::EXTERNAL)
+      continue;
+    std::pair CurrentBranch(Trace.Branch, Trace.From);
+    // BR_ONLY must be the last trace in the group.
+    if (Trace.To == Trace::BR_ONLY) {
+      // If the group is not empty, use aggregate values; otherwise 0-length
+      // for unconditional jumps (call/ret/uncond branch) or 1-length for
+      // others.
+      uint64_t InferredBytes =
+          PrevBranch == CurrentBranch
+              ? AggregateFallthroughSize / AggregateCount
+              : !checkUnconditionalControlTransfer(Trace.From);
+      Trace.To = Trace.From + InferredBytes;
+      LLVM_DEBUG(dbgs() << "imputed " << Trace << " (" << InferredBytes
+                        << " bytes)\n");
+      ++InferredTraces;
+    } else {
+      // Trace with a valid fall-through.
+      // New group: reset aggregates.
+      if (CurrentBranch != PrevBranch)
+        AggregateCount = AggregateFallthroughSize = 0;
+      // Only use valid fall-through lengths.
+      if (Trace.To != Trace::EXTERNAL)
+        AggregateFallthroughSize += (Trace.To - Trace.From) * Info.TakenCount;
+      AggregateCount += Info.TakenCount;
+    }
+    PrevBranch = CurrentBranch;
+  }
+  if (opts::Verbosity >= 1)
+    outs() << "BOLT-INFO: imputed " << InferredTraces << " traces\n";
+}
+
 Error DataAggregator::preprocessProfile(BinaryContext &BC) {
   this->BC = &BC;
@@ -525,6 +593,9 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) {
   // Sort parsed traces for faster processing.
   llvm::sort(Traces, llvm::less_first());
 
+  if (opts::ImputeTraceFallthrough)
+    imputeFallThroughs();
+
   if (opts::HeatmapMode) {
     if (std::error_code EC = printLBRHeatMap())
       return errorCodeToError(EC);
@@ -726,22 +797,10 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc,
 }
 
 bool DataAggregator::checkReturn(uint64_t Addr) {
-  auto isReturn = [&](auto MI) { return MI && BC->MIB->isReturn(*MI); };
-  if (llvm::is_contained(Returns, Addr))
-    return true;
-
-  BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr);
-  if (!Func)
-    return false;
-
-  const uint64_t Offset = Addr - Func->getAddress();
-  if (Func->hasInstructions()
-          ? isReturn(Func->getInstructionAtOffset(Offset))
-          : isReturn(Func->disassembleInstructionAtOffset(Offset))) {
-    Returns.emplace(Addr);
-    return true;
-  }
-  return false;
+  auto isReturn = [&](const MCInst &MI) -> bool {
+    return BC->MIB->isReturn(MI);
+  };
+  return testAndSet<bool>(Addr, isReturn, Returns).value_or(false);
 }
 
 bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
@@ -1331,7 +1390,7 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
     if (!Addr[0]->Offset)
       Addr[0]->Offset = Trace::FT_EXTERNAL_RETURN;
     else
-      Returns.emplace(Addr[0]->Offset);
+      Returns.emplace(Addr[0]->Offset, true);
   }
 
   /// Record a trace.
@@ -1592,7 +1651,7 @@ void DataAggregator::processBranchEvents() {
   NamedRegionTimer T("processBranch", "Processing branch events",
                      TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
 
-  Returns.emplace(Trace::FT_EXTERNAL_RETURN);
+  Returns.emplace(Trace::FT_EXTERNAL_RETURN, true);
   for (const auto &[Trace, Info] : Traces) {
     bool IsReturn = checkReturn(Trace.Branch);
     // Ignore returns.
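To make the imputation arithmetic concrete, here is a self-contained sketch of the grouping pass on toy data. The Trace struct, BR_ONLY constant, and addresses are simplified stand-ins for BOLT's types, and the empty-group fallback (0 bytes for an unconditional control transfer, 1 byte otherwise, via checkUnconditionalControlTransfer) is reduced to 0 bytes here. For the group below the weighted average is (16*3 + 32*1) / (3 + 1) = 20 bytes.

    #include <cstdint>
    #include <iostream>
    #include <utility>
    #include <vector>

    // Traces are (Branch, From, To, Count) tuples sorted by address, with
    // To == BR_ONLY marking a top-of-stack entry whose fall-through was not
    // recorded by the LBR.
    constexpr uint64_t BR_ONLY = UINT64_MAX;

    struct Trace {
      uint64_t Branch, From, To, Count;
    };

    int main() {
      // Three traces share (Branch=0x100, From=0x200); the last is
      // branch-only. Valid fall-throughs: 16 bytes x3 samples, 32 bytes x1.
      std::vector<Trace> Traces = {{0x100, 0x200, 0x210, 3},
                                   {0x100, 0x200, 0x220, 1},
                                   {0x100, 0x200, BR_ONLY, 5}};
      uint64_t AggCount = 0, AggSize = 0;
      std::pair<uint64_t, uint64_t> Prev{BR_ONLY, BR_ONLY};
      for (Trace &T : Traces) {
        std::pair<uint64_t, uint64_t> Cur{T.Branch, T.From};
        if (T.To == BR_ONLY) {
          // Weighted average over the group: (16*3 + 32*1) / 4 = 20 bytes.
          uint64_t Bytes = (Prev == Cur && AggCount) ? AggSize / AggCount : 0;
          T.To = T.From + Bytes;
          std::cout << "imputed fall-through of " << Bytes << " bytes\n";
        } else {
          if (Cur != Prev) // new group: reset aggregates
            AggCount = AggSize = 0;
          AggSize += (T.To - T.From) * T.Count;
          AggCount += T.Count;
        }
        Prev = Cur;
      }
    }

Because sorting places the BR_ONLY entry after its siblings, one linear pass suffices: the aggregates for a group are always complete by the time its branch-only trace is reached.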