diff options
author | Amir Ayupov <aaupov@fb.com> | 2025-05-08 16:10:24 -0700 |
---|---|---|
committer | Amir Ayupov <aaupov@fb.com> | 2025-05-08 16:10:24 -0700 |
commit | ced7a9341c60a4a5d39a9c282306106b9569c83f (patch) | |
tree | 0bdb1e74bf0e433983f6fcaefa86632c2dc37d8b | |
parent | 54aa16d2934f0d57184ab203bc3a0f534501f508 (diff) | |
download | llvm-users/aaupov/spr/main.bolttest-add-pseudo-probe-split-functest.zip llvm-users/aaupov/spr/main.bolttest-add-pseudo-probe-split-functest.tar.gz llvm-users/aaupov/spr/main.bolttest-add-pseudo-probe-split-functest.tar.bz2 |
[𝘀𝗽𝗿] changes to main this commit is based on users/aaupov/spr/main.bolttest-add-pseudo-probe-split-functest
Created using spr 1.3.4
[skip ci]
-rw-r--r-- | bolt/include/bolt/Core/BinaryFunction.h | 15 | ||||
-rw-r--r-- | bolt/include/bolt/Profile/DataAggregator.h | 20 | ||||
-rw-r--r-- | bolt/include/bolt/Profile/DataReader.h | 3 | ||||
-rw-r--r-- | bolt/include/bolt/Profile/Heatmap.h | 22 | ||||
-rw-r--r-- | bolt/lib/Core/BinaryFunction.cpp | 2 | ||||
-rw-r--r-- | bolt/lib/Passes/BinaryPasses.cpp | 2 | ||||
-rw-r--r-- | bolt/lib/Profile/DataAggregator.cpp | 252 | ||||
-rw-r--r-- | bolt/lib/Profile/DataReader.cpp | 11 | ||||
-rw-r--r-- | bolt/lib/Profile/Heatmap.cpp | 78 | ||||
-rw-r--r-- | bolt/lib/Profile/YAMLProfileReader.cpp | 6 | ||||
-rw-r--r-- | bolt/test/X86/heatmap-preagg.test | 35 | ||||
-rw-r--r-- | bolt/test/perf2bolt/perf_test.test | 1 |
12 files changed, 231 insertions, 216 deletions
diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h index a529985..e82b857 100644 --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -386,8 +386,8 @@ private: /// Profile match ratio. float ProfileMatchRatio{0.0f}; - /// Raw branch count for this function in the profile. - uint64_t RawBranchCount{0}; + /// Raw sample/branch count for this function in the profile. + uint64_t RawSampleCount{0}; /// Dynamically executed function bytes, used for density computation. uint64_t SampleCountInBytes{0}; @@ -1880,13 +1880,12 @@ public: /// Return COUNT_NO_PROFILE if there's no profile info. uint64_t getExecutionCount() const { return ExecutionCount; } - /// Return the raw profile information about the number of branch - /// executions corresponding to this function. - uint64_t getRawBranchCount() const { return RawBranchCount; } + /// Return the raw profile information about the number of samples (basic + /// profile) or branch executions (branch profile) recorded in this function. + uint64_t getRawSampleCount() const { return RawSampleCount; } - /// Set the profile data about the number of branch executions corresponding - /// to this function. - void setRawBranchCount(uint64_t Count) { RawBranchCount = Count; } + /// Set raw count of samples or branches recorded in this function. + void setRawSampleCount(uint64_t Count) { RawSampleCount = Count; } /// Return the number of dynamically executed bytes, from raw perf data. uint64_t getSampleCountInBytes() const { return SampleCountInBytes; } diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h index c4ee75e..d66d198 100644 --- a/bolt/include/bolt/Profile/DataAggregator.h +++ b/bolt/include/bolt/Profile/DataAggregator.h @@ -92,16 +92,6 @@ private: uint64_t Addr; }; - /// Used for parsing specific pre-aggregated input files. 
- struct AggregatedLBREntry { - enum Type : char { BRANCH = 0, FT, FT_EXTERNAL_ORIGIN, TRACE }; - Location From; - Location To; - uint64_t Count; - uint64_t Mispreds; - Type EntryType; - }; - struct Trace { uint64_t From; uint64_t To; @@ -131,7 +121,6 @@ private: /// and use them later for processing and assigning profile. std::unordered_map<Trace, TakenBranchInfo, TraceHash> BranchLBRs; std::unordered_map<Trace, FTInfo, TraceHash> FallthroughLBRs; - std::vector<AggregatedLBREntry> AggregatedLBRs; std::unordered_map<uint64_t, uint64_t> BasicSamples; std::vector<PerfMemSample> MemSamples; @@ -416,14 +405,7 @@ private: /// F 41be90 41be90 4 /// B 4b1942 39b57f0 3 0 /// B 4b196f 4b19e0 2 0 - void parsePreAggregated(); - - /// Parse the full output of pre-aggregated LBR samples generated by - /// an external tool. - std::error_code parsePreAggregatedLBRSamples(); - - /// Process parsed pre-aggregated data. - void processPreAggregated(); + std::error_code parsePreAggregated(); /// If \p Address falls into the binary address space based on memory /// mapping info \p MMI, then adjust it for further processing by subtracting diff --git a/bolt/include/bolt/Profile/DataReader.h b/bolt/include/bolt/Profile/DataReader.h index 314dcc9..a7a0933 100644 --- a/bolt/include/bolt/Profile/DataReader.h +++ b/bolt/include/bolt/Profile/DataReader.h @@ -252,6 +252,9 @@ struct FuncSampleData { /// Get the number of samples recorded in [Start, End) uint64_t getSamples(uint64_t Start, uint64_t End) const; + /// Returns the total number of samples recorded in this function. 
+ uint64_t getSamples() const; + /// Aggregation helper DenseMap<uint64_t, size_t> Index; diff --git a/bolt/include/bolt/Profile/Heatmap.h b/bolt/include/bolt/Profile/Heatmap.h index 74d7eed..a63b221 100644 --- a/bolt/include/bolt/Profile/Heatmap.h +++ b/bolt/include/bolt/Profile/Heatmap.h @@ -9,6 +9,7 @@ #ifndef BOLT_PROFILE_HEATMAP_H #define BOLT_PROFILE_HEATMAP_H +#include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include <cstdint> #include <map> @@ -57,9 +58,9 @@ public: } /// Register a single sample at \p Address. - void registerAddress(uint64_t Address) { + void registerAddress(uint64_t Address, uint64_t Count) { if (!ignoreAddress(Address)) - ++Map[Address / BucketSize]; + Map[Address / BucketSize] += Count; } /// Register \p Count samples at [\p StartAddress, \p EndAddress ]. @@ -77,9 +78,22 @@ public: void printCDF(raw_ostream &OS) const; - void printSectionHotness(StringRef Filename) const; + /// Struct describing individual section hotness. + struct SectionStats { + uint64_t Samples{0}; + uint64_t Buckets{0}; + }; - void printSectionHotness(raw_ostream &OS) const; + /// Mapping from section name to associated \p SectionStats. Special entries: + /// - [total] for total stats, + /// - [unmapped] for samples outside any section, if non-zero. 
+ using SectionStatsMap = StringMap<SectionStats>; + + SectionStatsMap computeSectionStats() const; + + void printSectionHotness(const SectionStatsMap &, StringRef Filename) const; + + void printSectionHotness(const SectionStatsMap &, raw_ostream &OS) const; size_t size() const { return Map.size(); } }; diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 9773e21..fc521dc 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -473,7 +473,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation) { OS << "\n Image : 0x" << Twine::utohexstr(getImageAddress()); if (ExecutionCount != COUNT_NO_PROFILE) { OS << "\n Exec Count : " << ExecutionCount; - OS << "\n Branch Count: " << RawBranchCount; + OS << "\n Branch Count: " << RawSampleCount; OS << "\n Profile Acc : " << format("%.1f%%", ProfileMatchRatio * 100.0f); } diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index d8628c6..420ffc8 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -1445,7 +1445,7 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) { if (!Function.hasProfile()) continue; - uint64_t SampleCount = Function.getRawBranchCount(); + uint64_t SampleCount = Function.getRawSampleCount(); TotalSampleCount += SampleCount; if (Function.hasValidProfile()) { diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index 80f4ea0..11850fa 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -349,25 +349,29 @@ bool DataAggregator::checkPerfDataMagic(StringRef FileName) { return false; } -void DataAggregator::parsePreAggregated() { - std::string Error; +std::error_code DataAggregator::parsePreAggregated() { + outs() << "PERF2BOLT: parsing pre-aggregated profile...\n"; + NamedRegionTimer T("parseAggregated", "Parsing aggregated branch events", + TimerGroupName, TimerGroupDesc, 
opts::TimeAggregator); ErrorOr<std::unique_ptr<MemoryBuffer>> MB = MemoryBuffer::getFileOrSTDIN(Filename); - if (std::error_code EC = MB.getError()) { - errs() << "PERF2BOLT-ERROR: cannot open " << Filename << ": " - << EC.message() << "\n"; - exit(1); - } + if (std::error_code EC = MB.getError()) + return EC; FileBuf = std::move(*MB); ParsingBuf = FileBuf->getBuffer(); Col = 0; Line = 1; - if (parsePreAggregatedLBRSamples()) { - errs() << "PERF2BOLT: failed to parse samples\n"; - exit(1); + size_t AggregatedLBRs = 0; + while (hasData()) { + if (std::error_code EC = parseAggregatedLBREntry()) + return EC; + ++AggregatedLBRs; } + + outs() << "PERF2BOLT: read " << AggregatedLBRs << " aggregated LBR entries\n"; + return std::error_code(); } void DataAggregator::filterBinaryMMapInfo() { @@ -446,11 +450,6 @@ int DataAggregator::prepareToParse(StringRef Name, PerfProcessInfo &Process, Error DataAggregator::preprocessProfile(BinaryContext &BC) { this->BC = &BC; - if (opts::ReadPreAggregated) { - parsePreAggregated(); - return Error::success(); - } - if (std::optional<StringRef> FileBuildID = BC.getFileBuildID()) { outs() << "BOLT-INFO: binary build-id is: " << *FileBuildID << "\n"; processFileBuildID(*FileBuildID); @@ -471,6 +470,12 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) { ErrorCallback(ReturnCode, ErrBuf); }; + if (opts::ReadPreAggregated) { + if (std::error_code EC = parsePreAggregated()) + return errorCodeToError(EC); + goto heatmap; + } + if (BC.IsLinuxKernel) { // Current MMap parsing logic does not work with linux kernel. 
// MMap entries for linux kernel uses PERF_RECORD_MMAP @@ -499,16 +504,7 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) { filterBinaryMMapInfo(); prepareToParse("events", MainEventsPPI, ErrorCallback); - if (opts::HeatmapMode) { - if (std::error_code EC = printLBRHeatMap()) { - errs() << "ERROR: failed to print heat map: " << EC.message() << '\n'; - exit(1); - } - exit(0); - } - - if ((!opts::BasicAggregation && parseBranchEvents()) || - (opts::BasicAggregation && parseBasicEvents())) + if (opts::BasicAggregation ? parseBasicEvents() : parseBranchEvents()) errs() << "PERF2BOLT: failed to parse samples\n"; // Special handling for memory events @@ -521,6 +517,13 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) { deleteTempFiles(); +heatmap: + if (opts::HeatmapMode) { + if (std::error_code EC = printLBRHeatMap()) + return errorCodeToError(EC); + exit(0); + } + return Error::success(); } @@ -557,9 +560,7 @@ bool DataAggregator::mayHaveProfileData(const BinaryFunction &Function) { } void DataAggregator::processProfile(BinaryContext &BC) { - if (opts::ReadPreAggregated) - processPreAggregated(); - else if (opts::BasicAggregation) + if (opts::BasicAggregation) processBasicEvents(); else processBranchEvents(); @@ -567,15 +568,14 @@ void DataAggregator::processProfile(BinaryContext &BC) { processMemEvents(); // Mark all functions with registered events as having a valid profile. - const auto Flags = opts::BasicAggregation ? 
BinaryFunction::PF_SAMPLE - : BinaryFunction::PF_LBR; for (auto &BFI : BC.getBinaryFunctions()) { BinaryFunction &BF = BFI.second; - FuncBranchData *FBD = getBranchData(BF); - if (FBD || getFuncSampleData(BF.getNames())) { - BF.markProfiled(Flags); - if (FBD) - BF.RawBranchCount = FBD->getNumExecutedBranches(); + if (FuncBranchData *FBD = getBranchData(BF)) { + BF.markProfiled(BinaryFunction::PF_LBR); + BF.RawSampleCount = FBD->getNumExecutedBranches(); + } else if (FuncSampleData *FSD = getFuncSampleData(BF.getNames())) { + BF.markProfiled(BinaryFunction::PF_SAMPLE); + BF.RawSampleCount = FSD->getSamples(); } } @@ -588,7 +588,6 @@ void DataAggregator::processProfile(BinaryContext &BC) { // Release intermediate storage. clear(BranchLBRs); clear(FallthroughLBRs); - clear(AggregatedLBRs); clear(BasicSamples); clear(MemSamples); } @@ -632,10 +631,18 @@ StringRef DataAggregator::getLocationName(const BinaryFunction &Func, bool DataAggregator::doSample(BinaryFunction &OrigFunc, uint64_t Address, uint64_t Count) { + // To record executed bytes, use basic block size as is regardless of BAT. + uint64_t BlockSize = 0; + if (BinaryBasicBlock *BB = OrigFunc.getBasicBlockContainingOffset( + Address - OrigFunc.getAddress())) + BlockSize = BB->getOriginalSize(); + BinaryFunction *ParentFunc = getBATParentFunction(OrigFunc); BinaryFunction &Func = ParentFunc ? *ParentFunc : OrigFunc; - if (ParentFunc || (BAT && !BAT->isBATFunction(OrigFunc.getAddress()))) + if (ParentFunc || (BAT && !BAT->isBATFunction(Func.getAddress()))) NumColdSamples += Count; + // Attach executed bytes to parent function in case of cold fragment. 
+ Func.SampleCountInBytes += Count * BlockSize; auto I = NamesToSamples.find(Func.getOneName()); if (I == NamesToSamples.end()) { @@ -1209,15 +1216,14 @@ std::error_code DataAggregator::parseAggregatedLBREntry() { ErrorOr<StringRef> TypeOrErr = parseString(FieldSeparator); if (std::error_code EC = TypeOrErr.getError()) return EC; - auto Type = AggregatedLBREntry::TRACE; - if (LLVM_LIKELY(TypeOrErr.get() == "T")) { - } else if (TypeOrErr.get() == "B") { - Type = AggregatedLBREntry::BRANCH; - } else if (TypeOrErr.get() == "F") { - Type = AggregatedLBREntry::FT; - } else if (TypeOrErr.get() == "f") { - Type = AggregatedLBREntry::FT_EXTERNAL_ORIGIN; - } else { + enum TType { TRACE, BRANCH, FT, FT_EXTERNAL_ORIGIN, INVALID }; + auto Type = StringSwitch<TType>(TypeOrErr.get()) + .Case("T", TRACE) + .Case("B", BRANCH) + .Case("F", FT) + .Case("f", FT_EXTERNAL_ORIGIN) + .Default(INVALID); + if (Type == INVALID) { reportError("expected T, B, F or f"); return make_error_code(llvm::errc::io_error); } @@ -1235,7 +1241,7 @@ std::error_code DataAggregator::parseAggregatedLBREntry() { return EC; ErrorOr<Location> TraceFtEnd = std::error_code(); - if (Type == AggregatedLBREntry::TRACE) { + if (Type == TRACE) { while (checkAndConsumeFS()) { } TraceFtEnd = parseLocationOrOffset(); @@ -1245,13 +1251,12 @@ std::error_code DataAggregator::parseAggregatedLBREntry() { while (checkAndConsumeFS()) { } - ErrorOr<int64_t> Frequency = - parseNumberField(FieldSeparator, Type != AggregatedLBREntry::BRANCH); + ErrorOr<int64_t> Frequency = parseNumberField(FieldSeparator, Type != BRANCH); if (std::error_code EC = Frequency.getError()) return EC; uint64_t Mispreds = 0; - if (Type == AggregatedLBREntry::BRANCH) { + if (Type == BRANCH) { while (checkAndConsumeFS()) { } ErrorOr<int64_t> MispredsOrErr = parseNumberField(FieldSeparator, true); @@ -1273,13 +1278,28 @@ std::error_code DataAggregator::parseAggregatedLBREntry() { BF->setHasProfileAvailable(); uint64_t Count = 
static_cast<uint64_t>(Frequency.get()); - AggregatedLBREntry Entry{From.get(), To.get(), Count, Mispreds, Type}; - AggregatedLBRs.emplace_back(Entry); - if (Type == AggregatedLBREntry::TRACE) { - auto FtType = (FromFunc == ToFunc) ? AggregatedLBREntry::FT - : AggregatedLBREntry::FT_EXTERNAL_ORIGIN; - AggregatedLBREntry TraceFt{To.get(), TraceFtEnd.get(), Count, 0, FtType}; - AggregatedLBRs.emplace_back(TraceFt); + + Trace Trace(From->Offset, To->Offset); + // Taken trace + if (Type == TRACE || Type == BRANCH) { + TakenBranchInfo &Info = BranchLBRs[Trace]; + Info.TakenCount += Count; + Info.MispredCount += Mispreds; + + NumTotalSamples += Count; + } + // Construct fallthrough part of the trace + if (Type == TRACE) { + Trace.From = To->Offset; + Trace.To = TraceFtEnd->Offset; + Type = FromFunc == ToFunc ? FT : FT_EXTERNAL_ORIGIN; + } + // Add fallthrough trace + if (Type != BRANCH) { + FTInfo &Info = FallthroughLBRs[Trace]; + (Type == FT ? Info.InternCount : Info.ExternCount) += Count; + + NumTraces += Count; } return std::error_code(); @@ -1301,53 +1321,6 @@ std::error_code DataAggregator::printLBRHeatMap() { } Heatmap HM(opts::HeatmapBlock, opts::HeatmapMinAddress, opts::HeatmapMaxAddress, getTextSections(BC)); - uint64_t NumTotalSamples = 0; - - if (opts::BasicAggregation) { - while (hasData()) { - ErrorOr<PerfBasicSample> SampleRes = parseBasicSample(); - if (std::error_code EC = SampleRes.getError()) { - if (EC == errc::no_such_process) - continue; - return EC; - } - PerfBasicSample &Sample = SampleRes.get(); - HM.registerAddress(Sample.PC); - NumTotalSamples++; - } - outs() << "HEATMAP: read " << NumTotalSamples << " basic samples\n"; - } else { - while (hasData()) { - ErrorOr<PerfBranchSample> SampleRes = parseBranchSample(); - if (std::error_code EC = SampleRes.getError()) { - if (EC == errc::no_such_process) - continue; - return EC; - } - - PerfBranchSample &Sample = SampleRes.get(); - - // LBRs are stored in reverse execution order. 
NextLBR refers to the next - // executed branch record. - const LBREntry *NextLBR = nullptr; - for (const LBREntry &LBR : Sample.LBR) { - if (NextLBR) { - // Record fall-through trace. - const uint64_t TraceFrom = LBR.To; - const uint64_t TraceTo = NextLBR->From; - ++FallthroughLBRs[Trace(TraceFrom, TraceTo)].InternCount; - } - NextLBR = &LBR; - } - if (!Sample.LBR.empty()) { - HM.registerAddress(Sample.LBR.front().To); - HM.registerAddress(Sample.LBR.back().From); - } - NumTotalSamples += Sample.LBR.size(); - } - outs() << "HEATMAP: read " << NumTotalSamples << " LBR samples\n"; - outs() << "HEATMAP: " << FallthroughLBRs.size() << " unique traces\n"; - } if (!NumTotalSamples) { if (opts::BasicAggregation) { @@ -1363,6 +1336,8 @@ std::error_code DataAggregator::printLBRHeatMap() { outs() << "HEATMAP: building heat map...\n"; + for (const auto &[PC, Hits] : BasicSamples) + HM.registerAddress(PC, Hits); for (const auto &LBR : FallthroughLBRs) { const Trace &Trace = LBR.first; const FTInfo &Info = LBR.second; @@ -1382,10 +1357,12 @@ std::error_code DataAggregator::printLBRHeatMap() { HM.printCDF(opts::OutputFilename); else HM.printCDF(opts::OutputFilename + ".csv"); + Heatmap::SectionStatsMap Stats = HM.computeSectionStats(); if (opts::OutputFilename == "-") - HM.printSectionHotness(opts::OutputFilename); + HM.printSectionHotness(Stats, opts::OutputFilename); else - HM.printSectionHotness(opts::OutputFilename + "-section-hotness.csv"); + HM.printSectionHotness(Stats, + opts::OutputFilename + "-section-hotness.csv"); return std::error_code(); } @@ -1412,7 +1389,10 @@ void DataAggregator::parseLBRSample(const PerfBranchSample &Sample, const uint64_t TraceTo = NextLBR->From; const BinaryFunction *TraceBF = getBinaryFunctionContainingAddress(TraceFrom); - if (TraceBF && TraceBF->containsAddress(TraceTo)) { + if (opts::HeatmapMode) { + FTInfo &Info = FallthroughLBRs[Trace(TraceFrom, TraceTo)]; + ++Info.InternCount; + } else if (TraceBF && TraceBF->containsAddress(TraceTo)) 
{ FTInfo &Info = FallthroughLBRs[Trace(TraceFrom, TraceTo)]; if (TraceBF->containsAddress(LBR.From)) ++Info.InternCount; @@ -1446,6 +1426,11 @@ void DataAggregator::parseLBRSample(const PerfBranchSample &Sample, } NextLBR = &LBR; + if (opts::HeatmapMode) { + TakenBranchInfo &Info = BranchLBRs[Trace(LBR.From, LBR.To)]; + ++Info.TakenCount; + continue; + } uint64_t From = getBinaryFunctionContainingAddress(LBR.From) ? LBR.From : 0; uint64_t To = getBinaryFunctionContainingAddress(LBR.To) ? LBR.To : 0; if (!From && !To) @@ -1454,6 +1439,10 @@ void DataAggregator::parseLBRSample(const PerfBranchSample &Sample, ++Info.TakenCount; Info.MispredCount += LBR.Mispred; } + if (opts::HeatmapMode && !Sample.LBR.empty()) { + ++BasicSamples[Sample.LBR.front().To]; + ++BasicSamples[Sample.LBR.back().From]; + } } void DataAggregator::printColdSamplesDiagnostic() const { @@ -1589,7 +1578,6 @@ std::error_code DataAggregator::parseBranchEvents() { printBranchStacksDiagnostics(NumTotalSamples - NumSamples); } } - printBranchSamplesDiagnostics(); return std::error_code(); } @@ -1617,6 +1605,7 @@ void DataAggregator::processBranchEvents() { const TakenBranchInfo &Info = AggrLBR.second; doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount); } + printBranchSamplesDiagnostics(); } std::error_code DataAggregator::parseBasicEvents() { @@ -1630,6 +1619,7 @@ std::error_code DataAggregator::parseBasicEvents() { if (!Sample->PC) continue; + ++NumTotalSamples; if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Sample->PC)) BF->setHasProfileAvailable(); @@ -1637,6 +1627,7 @@ std::error_code DataAggregator::parseBasicEvents() { ++BasicSamples[Sample->PC]; EventNames.insert(Sample->EventName); } + outs() << "PERF2BOLT: read " << NumTotalSamples << " basic samples\n"; return std::error_code(); } @@ -1649,7 +1640,6 @@ void DataAggregator::processBasicEvents() { for (auto &Sample : BasicSamples) { const uint64_t PC = Sample.first; const uint64_t HitCount = Sample.second; - 
NumTotalSamples += HitCount; BinaryFunction *Func = getBinaryFunctionContainingAddress(PC); if (!Func) { OutOfRangeSamples += HitCount; @@ -1658,7 +1648,6 @@ void DataAggregator::processBasicEvents() { doSample(*Func, PC, HitCount); } - outs() << "PERF2BOLT: read " << NumTotalSamples << " samples\n"; printBasicSamplesDiagnostics(OutOfRangeSamples); } @@ -1722,49 +1711,6 @@ void DataAggregator::processMemEvents() { } } -std::error_code DataAggregator::parsePreAggregatedLBRSamples() { - outs() << "PERF2BOLT: parsing pre-aggregated profile...\n"; - NamedRegionTimer T("parseAggregated", "Parsing aggregated branch events", - TimerGroupName, TimerGroupDesc, opts::TimeAggregator); - while (hasData()) - if (std::error_code EC = parseAggregatedLBREntry()) - return EC; - - return std::error_code(); -} - -void DataAggregator::processPreAggregated() { - outs() << "PERF2BOLT: processing pre-aggregated profile...\n"; - NamedRegionTimer T("processAggregated", "Processing aggregated branch events", - TimerGroupName, TimerGroupDesc, opts::TimeAggregator); - - for (const AggregatedLBREntry &AggrEntry : AggregatedLBRs) { - switch (AggrEntry.EntryType) { - case AggregatedLBREntry::BRANCH: - case AggregatedLBREntry::TRACE: - doBranch(AggrEntry.From.Offset, AggrEntry.To.Offset, AggrEntry.Count, - AggrEntry.Mispreds); - NumTotalSamples += AggrEntry.Count; - break; - case AggregatedLBREntry::FT: - case AggregatedLBREntry::FT_EXTERNAL_ORIGIN: { - LBREntry First{AggrEntry.EntryType == AggregatedLBREntry::FT - ? 
AggrEntry.From.Offset - : 0, - AggrEntry.From.Offset, false}; - LBREntry Second{AggrEntry.To.Offset, AggrEntry.To.Offset, false}; - doTrace(First, Second, AggrEntry.Count); - NumTraces += AggrEntry.Count; - break; - } - } - } - - outs() << "PERF2BOLT: read " << AggregatedLBRs.size() - << " aggregated LBR entries\n"; - printBranchSamplesDiagnostics(); -} - std::optional<int32_t> DataAggregator::parseCommExecEvent() { size_t LineEnd = ParsingBuf.find_first_of("\n"); if (LineEnd == StringRef::npos) { diff --git a/bolt/lib/Profile/DataReader.cpp b/bolt/lib/Profile/DataReader.cpp index f2e999b..4a92c9e 100644 --- a/bolt/lib/Profile/DataReader.cpp +++ b/bolt/lib/Profile/DataReader.cpp @@ -128,6 +128,13 @@ uint64_t FuncSampleData::getSamples(uint64_t Start, uint64_t End) const { return Result; } +uint64_t FuncSampleData::getSamples() const { + uint64_t Result = 0; + for (const SampleInfo &I : Data) + Result += I.Hits; + return Result; +} + void FuncSampleData::bumpCount(uint64_t Offset, uint64_t Count) { auto Iter = Index.find(Offset); if (Iter == Index.end()) { @@ -407,12 +414,12 @@ void DataReader::matchProfileData(BinaryFunction &BF) { FuncBranchData *FBD = getBranchData(BF); if (FBD) { BF.ProfileMatchRatio = evaluateProfileData(BF, *FBD); - BF.RawBranchCount = FBD->getNumExecutedBranches(); + BF.RawSampleCount = FBD->getNumExecutedBranches(); if (BF.ProfileMatchRatio == 1.0f) { if (fetchProfileForOtherEntryPoints(BF)) { BF.ProfileMatchRatio = evaluateProfileData(BF, *FBD); BF.ExecutionCount = FBD->ExecutionCount; - BF.RawBranchCount = FBD->getNumExecutedBranches(); + BF.RawSampleCount = FBD->getNumExecutedBranches(); } return; } diff --git a/bolt/lib/Profile/Heatmap.cpp b/bolt/lib/Profile/Heatmap.cpp index 5fc3e06..067c96c 100644 --- a/bolt/lib/Profile/Heatmap.cpp +++ b/bolt/lib/Profile/Heatmap.cpp @@ -284,64 +284,92 @@ void Heatmap::printCDF(raw_ostream &OS) const { Counts.clear(); } -void Heatmap::printSectionHotness(StringRef FileName) const { +void 
Heatmap::printSectionHotness(const Heatmap::SectionStatsMap &Stats, + StringRef FileName) const { std::error_code EC; raw_fd_ostream OS(FileName, EC, sys::fs::OpenFlags::OF_None); if (EC) { errs() << "error opening output file: " << EC.message() << '\n'; exit(1); } - printSectionHotness(OS); + printSectionHotness(Stats, OS); } -void Heatmap::printSectionHotness(raw_ostream &OS) const { +StringMap<Heatmap::SectionStats> Heatmap::computeSectionStats() const { uint64_t NumTotalCounts = 0; - StringMap<uint64_t> SectionHotness; + StringMap<SectionStats> Stat; unsigned TextSectionIndex = 0; if (TextSections.empty()) - return; + return Stat; uint64_t UnmappedHotness = 0; auto RecordUnmappedBucket = [&](uint64_t Address, uint64_t Frequency) { - errs() << "Couldn't map the address bucket [0x" << Twine::utohexstr(Address) - << ", 0x" << Twine::utohexstr(Address + BucketSize) - << "] containing " << Frequency - << " samples to a text section in the binary."; + if (opts::Verbosity >= 1) + errs() << "Couldn't map the address bucket [" + << formatv("{0:x}, {1:x}", Address, Address + BucketSize) + << "] containing " << Frequency + << " samples to a text section in the binary.\n"; UnmappedHotness += Frequency; }; - for (const std::pair<const uint64_t, uint64_t> &KV : Map) { - NumTotalCounts += KV.second; + for (const auto [Bucket, Count] : Map) { + NumTotalCounts += Count; // We map an address bucket to the first section (lowest address) // overlapping with that bucket. 
- auto Address = KV.first * BucketSize; + auto Address = Bucket * BucketSize; while (TextSectionIndex < TextSections.size() && Address >= TextSections[TextSectionIndex].EndAddress) TextSectionIndex++; if (TextSectionIndex >= TextSections.size() || Address + BucketSize < TextSections[TextSectionIndex].BeginAddress) { - RecordUnmappedBucket(Address, KV.second); + RecordUnmappedBucket(Address, Count); continue; } - SectionHotness[TextSections[TextSectionIndex].Name] += KV.second; + SectionStats &SecStats = Stat[TextSections[TextSectionIndex].Name]; + ++SecStats.Buckets; + SecStats.Samples += Count; } + Stat["[total]"] = SectionStats{NumTotalCounts, Map.size()}; + if (UnmappedHotness) + Stat["[unmapped]"] = SectionStats{UnmappedHotness, 0}; + + return Stat; +} +void Heatmap::printSectionHotness(const StringMap<SectionStats> &Stats, + raw_ostream &OS) const { + if (TextSections.empty()) + return; + + auto TotalIt = Stats.find("[total]"); + assert(TotalIt != Stats.end() && "Malformed SectionStatsMap"); + const uint64_t NumTotalCounts = TotalIt->second.Samples; assert(NumTotalCounts > 0 && "total number of heatmap buckets should be greater than 0"); - OS << "Section Name, Begin Address, End Address, Percentage Hotness\n"; - for (auto &TextSection : TextSections) { - OS << TextSection.Name << ", 0x" - << Twine::utohexstr(TextSection.BeginAddress) << ", 0x" - << Twine::utohexstr(TextSection.EndAddress) << ", " - << format("%.4f", - 100.0 * SectionHotness[TextSection.Name] / NumTotalCounts) - << "\n"; + OS << "Section Name, Begin Address, End Address, Percentage Hotness, " + << "Utilization Pct\n"; + for (const auto [Name, Begin, End] : TextSections) { + uint64_t Samples = 0; + uint64_t Buckets = 0; + auto SectionIt = Stats.find(Name); + if (SectionIt != Stats.end()) { + Samples = SectionIt->second.Samples; + Buckets = SectionIt->second.Buckets; + } + const float RelHotness = 100. 
* Samples / NumTotalCounts; + const unsigned NumBuckets = + End / BucketSize + !!(End % BucketSize) - Begin / BucketSize; + const float BucketUtilization = 100. * Buckets / NumBuckets; + OS << formatv("{0}, {1:x}, {2:x}, {3:f4}, {4:f4}\n", Name, Begin, End, + RelHotness, BucketUtilization); } - if (UnmappedHotness > 0) - OS << "[unmapped], 0x0, 0x0, " - << format("%.4f", 100.0 * UnmappedHotness / NumTotalCounts) << "\n"; + auto UnmappedIt = Stats.find("[unmapped]"); + if (UnmappedIt == Stats.end()) + return; + const float UnmappedPct = 100. * UnmappedIt->second.Samples / NumTotalCounts; + OS << formatv("[unmapped], 0x0, 0x0, {0:f4}, 0\n", UnmappedPct); } } // namespace bolt } // namespace llvm diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp index f5636bf..88b806c 100644 --- a/bolt/lib/Profile/YAMLProfileReader.cpp +++ b/bolt/lib/Profile/YAMLProfileReader.cpp @@ -177,11 +177,11 @@ bool YAMLProfileReader::parseFunctionProfile( BF.setExecutionCount(YamlBF.ExecCount); - uint64_t FuncRawBranchCount = 0; + uint64_t FuncRawSampleCount = 0; for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) for (const yaml::bolt::SuccessorInfo &YamlSI : YamlBB.Successors) - FuncRawBranchCount += YamlSI.Count; - BF.setRawBranchCount(FuncRawBranchCount); + FuncRawSampleCount += YamlSI.Count; + BF.setRawSampleCount(FuncRawSampleCount); if (BF.empty()) return true; diff --git a/bolt/test/X86/heatmap-preagg.test b/bolt/test/X86/heatmap-preagg.test new file mode 100644 index 0000000..660d37f --- /dev/null +++ b/bolt/test/X86/heatmap-preagg.test @@ -0,0 +1,35 @@ +## Test heatmap with pre-aggregated profile + +RUN: yaml2obj %p/Inputs/blarge_new.yaml &> %t.exe +## Non-BOLTed input binary +RUN: llvm-bolt-heatmap %t.exe -o %t --pa -p %p/Inputs/blarge_new.preagg.txt \ +RUN: 2>&1 | FileCheck --check-prefix CHECK-HEATMAP %s +RUN: FileCheck %s --check-prefix CHECK-SEC-HOT --input-file %t-section-hotness.csv + +## BOLTed input binary +RUN: 
llvm-bolt %t.exe -o %t.out --pa -p %p/Inputs/blarge_new.preagg.txt \ +RUN: --reorder-blocks=ext-tsp --split-functions --split-strategy=cdsplit \ +RUN: --reorder-functions=cdsort --enable-bat --dyno-stats --skip-funcs=main +RUN: llvm-bolt-heatmap %t.out -o %t2 --pa -p %p/Inputs/blarge_new_bat.preagg.txt \ +RUN: 2>&1 | FileCheck --check-prefix CHECK-HEATMAP-BAT %s +RUN: FileCheck %s --check-prefix CHECK-SEC-HOT-BAT --input-file %t2-section-hotness.csv + +CHECK-HEATMAP: PERF2BOLT: read 81 aggregated LBR entries +CHECK-HEATMAP: HEATMAP: invalid traces: 1 + +CHECK-SEC-HOT: Section Name, Begin Address, End Address, Percentage Hotness, Utilization Pct +CHECK-SEC-HOT-NEXT: .init, 0x401000, 0x40101b, 16.8545, 100.0000 +CHECK-SEC-HOT-NEXT: .plt, 0x401020, 0x4010b0, 4.7583, 66.6667 +CHECK-SEC-HOT-NEXT: .text, 0x4010b0, 0x401c25, 78.3872, 85.1064 +CHECK-SEC-HOT-NEXT: .fini, 0x401c28, 0x401c35, 0.0000, 0.0000 + +CHECK-HEATMAP-BAT: PERF2BOLT: read 79 aggregated LBR entries +CHECK-HEATMAP-BAT: HEATMAP: invalid traces: 2 + +CHECK-SEC-HOT-BAT: Section Name, Begin Address, End Address, Percentage Hotness, Utilization Pct +CHECK-SEC-HOT-BAT-NEXT: .init, 0x401000, 0x40101b, 17.2888, 100.0000 +CHECK-SEC-HOT-BAT-NEXT: .plt, 0x401020, 0x4010b0, 5.6132, 66.6667 +CHECK-SEC-HOT-BAT-NEXT: .bolt.org.text, 0x4010b0, 0x401c25, 38.3385 +CHECK-SEC-HOT-BAT-NEXT: .fini, 0x401c28, 0x401c35, 0.0000, 0.0000 +CHECK-SEC-HOT-BAT-NEXT: .text, 0x800000, 0x8002cc, 38.7595, 91.6667 +CHECK-SEC-HOT-BAT-NEXT: .text.cold, 0x800300, 0x800415, 0.0000, 0.0000 diff --git a/bolt/test/perf2bolt/perf_test.test b/bolt/test/perf2bolt/perf_test.test index 7bec442..44111de 100644 --- a/bolt/test/perf2bolt/perf_test.test +++ b/bolt/test/perf2bolt/perf_test.test @@ -8,6 +8,7 @@ RUN: perf2bolt %t -p=%t2 -o %t3 -nl -ignore-build-id 2>&1 | FileCheck %s CHECK-NOT: PERF2BOLT-ERROR CHECK-NOT: !! WARNING !! This high mismatch ratio indicates the input binary is probably not the same binary used during profiling collection. 
+CHECK: BOLT-INFO: Functions with density >= {{.*}} account for 99.00% total sample counts. RUN: %clang %S/Inputs/perf_test.c -no-pie -fuse-ld=lld -o %t4 RUN: perf record -Fmax -e cycles:u -o %t5 -- %t4 |