diff options
author | Kyungwoo Lee <kyulee@meta.com> | 2024-10-09 15:37:41 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-10-09 15:37:41 -0700 |
commit | dc85d5263ed5e416cb4ddf405611472f4ef12fd3 (patch) | |
tree | 5025c790a7c3526f1919c5a9ae6142a1824fa457 /llvm/lib/LTO/LTO.cpp | |
parent | 4aba20fecaa09089132afe451aa04a22cd3794ca (diff) | |
download | llvm-dc85d5263ed5e416cb4ddf405611472f4ef12fd3.zip llvm-dc85d5263ed5e416cb4ddf405611472f4ef12fd3.tar.gz llvm-dc85d5263ed5e416cb4ddf405611472f4ef12fd3.tar.bz2 |
[CGData][ThinLTO] Global Outlining with Two-CodeGen Rounds (#90933)
This feature is enabled by `-codegen-data-thinlto-two-rounds`, which
effectively runs the `-codegen-data-generate` and `-codegen-data-use` steps
in two rounds to enable global outlining with ThinLTO.
1. The first round: Run both optimization + codegen with a scratch
output.
Before running codegen, we serialize the optimized bitcode modules to a
temporary path.
2. Extract the codegen data from the scratch object files and merge it into the combined codegen data.
3. The second round: Read the optimized bitcode modules and run only
codegen this time.
Using the codegen data, the machine outliner effectively performs the
global outlining.
Depends on #90934, #110461 and #110463.
This is a patch for
https://discourse.llvm.org/t/rfc-enhanced-machine-outliner-part-2-thinlto-nolto/78753.
Diffstat (limited to 'llvm/lib/LTO/LTO.cpp')
-rw-r--r-- | llvm/lib/LTO/LTO.cpp | 237 |
1 files changed, 232 insertions, 5 deletions
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index e1714b2..8e7675f 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -13,6 +13,7 @@ #include "llvm/LTO/LTO.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/StableHashing.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" @@ -21,6 +22,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/CGData/CodeGenData.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/AutoUpgrade.h" @@ -35,6 +37,7 @@ #include "llvm/Linker/IRMover.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Object/IRObjectFile.h" +#include "llvm/Support/Caching.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" @@ -70,6 +73,8 @@ static cl::opt<bool> DumpThinCGSCCs("dump-thin-cg-sccs", cl::init(false), cl::Hidden, cl::desc("Dump the SCCs in the ThinLTO index's callgraph")); +extern cl::opt<bool> CodeGenDataThinLTOTwoRounds; + namespace llvm { /// Enable global value internalization in LTO. 
cl::opt<bool> EnableLTOInternalization( @@ -341,6 +346,20 @@ std::string llvm::computeLTOCacheKey( return toHex(Hasher.result()); } +std::string llvm::recomputeLTOCacheKey(const std::string &Key, + StringRef ExtraID) { + SHA1 Hasher; + + auto AddString = [&](StringRef Str) { + Hasher.update(Str); + Hasher.update(ArrayRef<uint8_t>{0}); + }; + AddString(Key); + AddString(ExtraID); + + return toHex(Hasher.result()); +} + static void thinLTOResolvePrevailingGUID( const Config &C, ValueInfo VI, DenseSet<GlobalValueSummary *> &GlobalInvolvedWithAlias, @@ -1398,6 +1417,7 @@ Error ThinBackendProc::emitFiles( namespace { class InProcessThinBackend : public ThinBackendProc { +protected: AddStreamFn AddStream; FileCache Cache; DenseSet<GlobalValue::GUID> CfiFunctionDefs; @@ -1424,7 +1444,7 @@ public: GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Name))); } - Error runThinLTOBackendThread( + virtual Error runThinLTOBackendThread( AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM, ModuleSummaryIndex &CombinedIndex, const FunctionImporter::ImportMapTy &ImportList, @@ -1513,6 +1533,173 @@ public: return Error::success(); } }; + +/// This backend is utilized in the first round of a two-codegen round process. +/// It first saves optimized bitcode files to disk before the codegen process +/// begins. After codegen, it stores the resulting object files in a scratch +/// buffer. Note the codegen data stored in the scratch buffer will be extracted +/// and merged in the subsequent step. 
+class FirstRoundThinBackend : public InProcessThinBackend { + AddStreamFn IRAddStream; + FileCache IRCache; + +public: + FirstRoundThinBackend( + const Config &Conf, ModuleSummaryIndex &CombinedIndex, + ThreadPoolStrategy ThinLTOParallelism, + const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, + AddStreamFn CGAddStream, FileCache CGCache, AddStreamFn IRAddStream, + FileCache IRCache) + : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism, + ModuleToDefinedGVSummaries, std::move(CGAddStream), + std::move(CGCache), /*OnWrite=*/nullptr, + /*ShouldEmitIndexFiles=*/false, + /*ShouldEmitImportsFiles=*/false), + IRAddStream(std::move(IRAddStream)), IRCache(std::move(IRCache)) {} + + Error runThinLTOBackendThread( + AddStreamFn CGAddStream, FileCache CGCache, unsigned Task, + BitcodeModule BM, ModuleSummaryIndex &CombinedIndex, + const FunctionImporter::ImportMapTy &ImportList, + const FunctionImporter::ExportSetTy &ExportList, + const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR, + const GVSummaryMapTy &DefinedGlobals, + MapVector<StringRef, BitcodeModule> &ModuleMap) override { + auto RunThinBackend = [&](AddStreamFn CGAddStream, + AddStreamFn IRAddStream) { + LTOLLVMContext BackendContext(Conf); + Expected<std::unique_ptr<Module>> MOrErr = BM.parseModule(BackendContext); + if (!MOrErr) + return MOrErr.takeError(); + + return thinBackend(Conf, Task, CGAddStream, **MOrErr, CombinedIndex, + ImportList, DefinedGlobals, &ModuleMap, + Conf.CodeGenOnly, IRAddStream); + }; + + auto ModuleID = BM.getModuleIdentifier(); + // Like InProcessThinBackend, we produce index files as needed for + // FirstRoundThinBackend. However, these files are not generated for + // SecondRoundThinBackend. 
+ if (ShouldEmitIndexFiles) { + if (auto E = emitFiles(ImportList, ModuleID, ModuleID.str())) + return E; + } + + assert((CGCache.isValid() == IRCache.isValid()) && + "Both caches for CG and IR should have matching availability"); + if (!CGCache.isValid() || !CombinedIndex.modulePaths().count(ModuleID) || + all_of(CombinedIndex.getModuleHash(ModuleID), + [](uint32_t V) { return V == 0; })) + // Cache disabled or no entry for this module in the combined index or + // no module hash. + return RunThinBackend(CGAddStream, IRAddStream); + + // Get CGKey for caching object in CGCache. + std::string CGKey = computeLTOCacheKey( + Conf, CombinedIndex, ModuleID, ImportList, ExportList, ResolvedODR, + DefinedGlobals, CfiFunctionDefs, CfiFunctionDecls); + Expected<AddStreamFn> CacheCGAddStreamOrErr = + CGCache(Task, CGKey, ModuleID); + if (Error Err = CacheCGAddStreamOrErr.takeError()) + return Err; + AddStreamFn &CacheCGAddStream = *CacheCGAddStreamOrErr; + + // Get IRKey for caching (optimized) IR in IRCache with an extra ID. + std::string IRKey = recomputeLTOCacheKey(CGKey, /*ExtraID=*/"IR"); + Expected<AddStreamFn> CacheIRAddStreamOrErr = + IRCache(Task, IRKey, ModuleID); + if (Error Err = CacheIRAddStreamOrErr.takeError()) + return Err; + AddStreamFn &CacheIRAddStream = *CacheIRAddStreamOrErr; + + // Ideally, both CG and IR caching should be synchronized. However, in + // practice, their availability may differ due to different expiration + // times. Therefore, if either cache is missing, the backend process is + // triggered. + if (CacheCGAddStream || CacheIRAddStream) { + LLVM_DEBUG(dbgs() << "[FirstRound] Cache Miss for " + << BM.getModuleIdentifier() << "\n"); + return RunThinBackend(CacheCGAddStream ? CacheCGAddStream : CGAddStream, + CacheIRAddStream ? CacheIRAddStream : IRAddStream); + } + + return Error::success(); + } +}; + +/// This backend operates in the second round of a two-codegen round process. 
+/// It starts by reading the optimized bitcode files that were saved during the +/// first round. The backend then executes the codegen only to further optimize +/// the code, utilizing the codegen data merged from the first round. Finally, +/// it writes the resulting object files as usual. +class SecondRoundThinBackend : public InProcessThinBackend { + std::unique_ptr<SmallVector<StringRef>> IRFiles; + stable_hash CombinedCGDataHash; + +public: + SecondRoundThinBackend( + const Config &Conf, ModuleSummaryIndex &CombinedIndex, + ThreadPoolStrategy ThinLTOParallelism, + const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, + AddStreamFn AddStream, FileCache Cache, + std::unique_ptr<SmallVector<StringRef>> IRFiles, + stable_hash CombinedCGDataHash) + : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism, + ModuleToDefinedGVSummaries, std::move(AddStream), + std::move(Cache), + /*OnWrite=*/nullptr, + /*ShouldEmitIndexFiles=*/false, + /*ShouldEmitImportsFiles=*/false), + IRFiles(std::move(IRFiles)), CombinedCGDataHash(CombinedCGDataHash) {} + + virtual Error runThinLTOBackendThread( + AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM, + ModuleSummaryIndex &CombinedIndex, + const FunctionImporter::ImportMapTy &ImportList, + const FunctionImporter::ExportSetTy &ExportList, + const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR, + const GVSummaryMapTy &DefinedGlobals, + MapVector<StringRef, BitcodeModule> &ModuleMap) override { + auto RunThinBackend = [&](AddStreamFn AddStream) { + LTOLLVMContext BackendContext(Conf); + std::unique_ptr<Module> LoadedModule = + cgdata::loadModuleForTwoRounds(BM, Task, BackendContext, *IRFiles); + + return thinBackend(Conf, Task, AddStream, *LoadedModule, CombinedIndex, + ImportList, DefinedGlobals, &ModuleMap, + /*CodeGenOnly=*/true); + }; + + auto ModuleID = BM.getModuleIdentifier(); + if (!Cache.isValid() || !CombinedIndex.modulePaths().count(ModuleID) || + 
all_of(CombinedIndex.getModuleHash(ModuleID), + [](uint32_t V) { return V == 0; })) + // Cache disabled or no entry for this module in the combined index or + // no module hash. + return RunThinBackend(AddStream); + + // Get Key for caching the final object file in Cache with the combined + // CGData hash. + std::string Key = computeLTOCacheKey( + Conf, CombinedIndex, ModuleID, ImportList, ExportList, ResolvedODR, + DefinedGlobals, CfiFunctionDefs, CfiFunctionDecls); + Key = recomputeLTOCacheKey(Key, + /*ExtraID=*/std::to_string(CombinedCGDataHash)); + Expected<AddStreamFn> CacheAddStreamOrErr = Cache(Task, Key, ModuleID); + if (Error Err = CacheAddStreamOrErr.takeError()) + return Err; + AddStreamFn &CacheAddStream = *CacheAddStreamOrErr; + + if (CacheAddStream) { + LLVM_DEBUG(dbgs() << "[SecondRound] Cache Miss for " + << BM.getModuleIdentifier() << "\n"); + return RunThinBackend(CacheAddStream); + } + + return Error::success(); + } +}; } // end anonymous namespace ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism, @@ -1855,10 +2042,50 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, return BackendProcess->wait(); }; - std::unique_ptr<ThinBackendProc> BackendProc = - ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, - AddStream, Cache); - return RunBackends(BackendProc.get()); + if (!CodeGenDataThinLTOTwoRounds) { + std::unique_ptr<ThinBackendProc> BackendProc = + ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, + AddStream, Cache); + return RunBackends(BackendProc.get()); + } + + // Perform two rounds of code generation for ThinLTO: + // 1. First round: Perform optimization and code generation, outputting to + // temporary scratch objects. + // 2. Merge code generation data extracted from the temporary scratch objects. + // 3. Second round: Execute code generation again using the merged data. 
+ LLVM_DEBUG(dbgs() << "[TwoRounds] Initializing ThinLTO two-codegen rounds\n"); + + unsigned MaxTasks = getMaxTasks(); + auto Parallelism = ThinLTO.Backend.getParallelism(); + // Set up two additional streams and caches for storing temporary scratch + // objects and optimized IRs, using the same cache directory as the original. + cgdata::StreamCacheData CG(MaxTasks, Cache, "CG"), IR(MaxTasks, Cache, "IR"); + + // First round: Execute optimization and code generation, outputting to + // temporary scratch objects. Serialize the optimized IRs before initiating + // code generation. + LLVM_DEBUG(dbgs() << "[TwoRounds] Running the first round of codegen\n"); + auto FirstRoundLTO = std::make_unique<FirstRoundThinBackend>( + Conf, ThinLTO.CombinedIndex, Parallelism, ModuleToDefinedGVSummaries, + CG.AddStream, CG.Cache, IR.AddStream, IR.Cache); + if (Error E = RunBackends(FirstRoundLTO.get())) + return E; + + LLVM_DEBUG(dbgs() << "[TwoRounds] Merging codegen data\n"); + auto CombinedHashOrErr = cgdata::mergeCodeGenData(*CG.getResult()); + if (Error E = CombinedHashOrErr.takeError()) + return E; + auto CombinedHash = *CombinedHashOrErr; + LLVM_DEBUG(dbgs() << "[TwoRounds] CGData hash: " << CombinedHash << "\n"); + + // Second round: Read the optimized IRs and execute code generation using the + // merged data. + LLVM_DEBUG(dbgs() << "[TwoRounds] Running the second round of codegen\n"); + auto SecondRoundLTO = std::make_unique<SecondRoundThinBackend>( + Conf, ThinLTO.CombinedIndex, Parallelism, ModuleToDefinedGVSummaries, + AddStream, Cache, std::move(IR.getResult()), CombinedHash); + return RunBackends(SecondRoundLTO.get()); } Expected<std::unique_ptr<ToolOutputFile>> lto::setupLLVMOptimizationRemarks( |