author    Alexandre Ganea <alexandre.ganea@ubisoft.com>  2020-10-13 21:54:00 -0400
committer Alexandre Ganea <alexandre.ganea@ubisoft.com>  2020-10-13 21:54:15 -0400
commit    617d64f6c5f8fdcdacc4401704146247152b96aa (patch)
tree      4015d1ac3864a3e42eaa24defbe1a1d2fca23824 /llvm/lib/LTO/LTO.cpp
parent    8f8b9f2cca0b73314342c721186ae9c860ca273c (diff)
Re-land [ThinLTO] Re-order modules for optimal multi-threaded processing
This reverts 9b5b3050237db3642ed7ab1bdb3ffa2202511b99 and fixes the unwanted re-ordering when generating ThinLTO indexes.

The goal of this patch is to better balance thread utilization during ThinLTO in-process linking (in llvm-lto2 or in LLD). Before this patch, large modules would often be scheduled late during execution, taking a long time to complete and thus starving the thread pool.

We now sort modules in descending order of bitcode size, so that larger modules are processed first. Smaller modules then have a better chance of keeping the thread pool busy, avoiding starvation when the bitcode compilation is almost complete.

In our case (a dual Intel Xeon Gold 6140, Windows 10 version 2004, two-stage build), this saves 15 sec when linking `clang.exe` with LLD and -flto=thin, /opt:lldltojobs=all, no ThinLTO cache, -DLLVM_INTEGRATED_CRT_ALLOC=d:\git\rpmalloc.

Before patch: 100 sec
After patch: 85 sec

Inspired by the work done by David Callahan in D60495.

Differential Revision: https://reviews.llvm.org/D87966
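The heuristic amounts to longest-job-first scheduling over module indices. A minimal sketch of the idea (illustrative only, not the patched code: orderLargestFirst and BitcodeSizes are hypothetical stand-ins for lto::generateModulesOrdering and BitcodeModule::getBuffer().size()):

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

// Sort module indices so the module with the largest bitcode buffer comes
// first; ties are broken arbitrarily, which is fine for scheduling.
std::vector<int> orderLargestFirst(const std::vector<std::size_t> &BitcodeSizes) {
  std::vector<int> Order(BitcodeSizes.size());
  std::iota(Order.begin(), Order.end(), 0); // 0, 1, ..., N-1
  std::sort(Order.begin(), Order.end(), [&](int L, int R) {
    return BitcodeSizes[L] > BitcodeSizes[R]; // larger module first
  });
  return Order;
}

For sizes {10, 500, 40} this yields {1, 2, 0}: the 500-byte module is handed to the thread pool first and the smallest module last.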
Diffstat (limited to 'llvm/lib/LTO/LTO.cpp')
-rw-r--r--  llvm/lib/LTO/LTO.cpp  64
1 file changed, 54 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 6230216..c2427dc0 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1107,6 +1107,7 @@ public:
const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
MapVector<StringRef, BitcodeModule> &ModuleMap) = 0;
virtual Error wait() = 0;
+ virtual unsigned getThreadCount() = 0;
};
namespace {
@@ -1221,6 +1222,10 @@ public:
else
return Error::success();
}
+
+ unsigned getThreadCount() override {
+ return BackendThreadPool.getThreadCount();
+ }
};
} // end anonymous namespace
@@ -1309,6 +1314,10 @@ public:
}
Error wait() override { return Error::success(); }
+
+ // WriteIndexesThinBackend should always return 1 to prevent module
+ // re-ordering and avoid non-determinism in the final link.
+ unsigned getThreadCount() override { return 1; }
};
} // end anonymous namespace
@@ -1443,17 +1452,37 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
auto &ModuleMap =
ThinLTO.ModulesToCompile ? *ThinLTO.ModulesToCompile : ThinLTO.ModuleMap;
- // Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for combined
- // module and parallel code generation partitions.
- unsigned Task = RegularLTO.ParallelCodeGenParallelismLevel;
- for (auto &Mod : ModuleMap) {
- if (Error E = BackendProc->start(Task, Mod.second, ImportLists[Mod.first],
- ExportLists[Mod.first],
- ResolvedODR[Mod.first], ThinLTO.ModuleMap))
- return E;
- ++Task;
- }
+ auto ProcessOneModule = [&](int I) -> Error {
+ auto &Mod = *(ModuleMap.begin() + I);
+ // Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for
+ // combined module and parallel code generation partitions.
+ return BackendProc->start(RegularLTO.ParallelCodeGenParallelismLevel + I,
+ Mod.second, ImportLists[Mod.first],
+ ExportLists[Mod.first], ResolvedODR[Mod.first],
+ ThinLTO.ModuleMap);
+ };
+ if (BackendProc->getThreadCount() == 1) {
+ // Process the modules in the order they were provided on the command-line.
+ // It is important for this codepath to be used for WriteIndexesThinBackend,
+ // to ensure the emitted LinkedObjectsFile lists ThinLTO objects in the same
+ // order as the inputs, which otherwise would affect the final link order.
+ for (int I = 0, E = ModuleMap.size(); I != E; ++I)
+ if (Error E = ProcessOneModule(I))
+ return E;
+ } else {
+ // When executing in parallel, process largest bitsize modules first to
+ // improve parallelism, and avoid starving the thread pool near the end.
+ // This saves about 15 sec on a 36-core machine when linking `clang.exe`
+ // (out of 100 sec).
+ std::vector<BitcodeModule *> ModulesVec;
+ ModulesVec.reserve(ModuleMap.size());
+ for (auto &Mod : ModuleMap)
+ ModulesVec.push_back(&Mod.second);
+ for (int I : generateModulesOrdering(ModulesVec))
+ if (Error E = ProcessOneModule(I))
+ return E;
+ }
return BackendProc->wait();
}
@@ -1495,3 +1524,18 @@ lto::setupStatsFile(StringRef StatsFilename) {
StatsFile->keep();
return std::move(StatsFile);
}
+
+// Compute the ordering in which we will process the inputs: the rough
+// heuristic here is to sort them by size so that the largest modules get
+// scheduled as soon as possible. This is purely a compile-time optimization.
+std::vector<int> lto::generateModulesOrdering(ArrayRef<BitcodeModule *> R) {
+ std::vector<int> ModulesOrdering;
+ ModulesOrdering.resize(R.size());
+ std::iota(ModulesOrdering.begin(), ModulesOrdering.end(), 0);
+ llvm::sort(ModulesOrdering, [&](int LeftIndex, int RightIndex) {
+ auto LSize = R[LeftIndex]->getBuffer().size();
+ auto RSize = R[RightIndex]->getBuffer().size();
+ return LSize > RSize;
+ });
+ return ModulesOrdering;
+}
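For reference, this is how the call site in runThinLTO above consumes the new helper; a hedged sketch reusing the names from that hunk (ModuleMap, ProcessOneModule), with the surrounding function context elided:

std::vector<BitcodeModule *> ModulesVec;
ModulesVec.reserve(ModuleMap.size());
for (auto &Mod : ModuleMap)
  ModulesVec.push_back(&Mod.second); // sort pointers; modules are not copied
for (int I : lto::generateModulesOrdering(ModulesVec))
  if (Error E = ProcessOneModule(I)) // largest bitcode is scheduled first
    return E;

Sorting a vector of indices (std::iota followed by llvm::sort) rather than the modules themselves leaves the MapVector iteration order intact, which is what lets the single-threaded WriteIndexesThinBackend path skip the re-ordering entirely and keep the final link deterministic.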