aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/LTO/LTO.cpp
diff options
context:
space:
mode:
authorMingming Liu <mingmingl@google.com>2024-09-08 14:52:03 -0700
committerGitHub <noreply@github.com>2024-09-08 14:52:03 -0700
commit9ade4e2646bd52b49e50c1648301da65de90ffa9 (patch)
treeea06bae8d6ef7ff603d526d2d5d43a9c014538ac /llvm/lib/LTO/LTO.cpp
parent80c47ad3aec9d7f22e1b1bdc88960a91b66f89f1 (diff)
downloadllvm-9ade4e2646bd52b49e50c1648301da65de90ffa9.zip
llvm-9ade4e2646bd52b49e50c1648301da65de90ffa9.tar.gz
llvm-9ade4e2646bd52b49e50c1648301da65de90ffa9.tar.bz2
[NFCI][LTO][lld] Optimize away symbol copies within LTO global resolution in ELF (#106193)
`StringMap<T>` creates a [copy of the string](https://github.com/llvm/llvm-project/blob/d4c519e7b2ac21350ec08b23eda44bf4a2d3c974/llvm/include/llvm/ADT/StringMapEntry.h#L55-L58) for entry insertions and intentionally keeps copies [since the implementation optimizes string memory usage](https://github.com/llvm/llvm-project/blob/d4c519e7b2ac21350ec08b23eda44bf4a2d3c974/llvm/include/llvm/ADT/StringMap.h#L124). On the other hand, the linker keeps copies of symbol names [1] in `lld::elf::parseFiles` [2] before invoking `compileBitcodeFiles` [3]. This change proposes to optimize away string copies inside [LTO::GlobalResolutions](https://github.com/llvm/llvm-project/blob/24e791b4164986a1ca7776e3ae0292ef20d20c47/llvm/include/llvm/LTO/LTO.h#L409), which will make LTO indexing more memory efficient for ELF. There are similar opportunities for other (COFF, wasm, MachO) formats. The optimization takes place for lld (ELF) only. For the rest of the use cases (gold plugin, `llvm-lto2`, etc.), LTO owns a string saver to keep copies and uses the global resolution key for de-duplication. Together with @kazutakahirata's work to make `ComputeCrossModuleImport` more memory efficient, we see a ~20% peak memory usage reduction in a binary where peak memory usage needs to go down. Thanks to the optimization in https://github.com/llvm/llvm-project/commit/329ba523ccbbe68a12434926c92fd9a86494d958, the max (as opposed to the sum) of `ComputeCrossModuleImport` and `GlobalResolution` shows up in peak memory usage. * Regarding correctness, the set of [resolved](https://github.com/llvm/llvm-project/blob/80c47ad3aec9d7f22e1b1bdc88960a91b66f89f1/llvm/lib/LTO/LTO.cpp#L739) [per-module symbols](https://github.com/llvm/llvm-project/blob/80c47ad3aec9d7f22e1b1bdc88960a91b66f89f1/llvm/include/llvm/LTO/LTO.h#L188-L191) is a subset of [llvm::lto::InputFile::Symbols](https://github.com/llvm/llvm-project/blob/80c47ad3aec9d7f22e1b1bdc88960a91b66f89f1/llvm/include/llvm/LTO/LTO.h#L120).
Moreover, bitcode symbol parsing already saves symbol names when iterating `obj->symbols` in `BitcodeFile::parse`. This change updates `BitcodeFile::parseLazy` to keep copies of per-module undefined symbols. * Presumably the undefined symbols in an LTO unit (copied by this patch into the linker's unique saver) are a small set compared with the set of symbols in global resolution (copied before this patch), making this a worthwhile trade-off. Benchmarking this change alone shows measurable memory savings across various benchmarks. [1] ELF https://github.com/llvm/llvm-project/blob/1cea5c2138bef3d8fec75508df6dbb858e6e3560/lld/ELF/InputFiles.cpp#L1748 [2] https://github.com/llvm/llvm-project/blob/ef7b18a53c0d186dcda1e322be6035407fdedb55/lld/ELF/Driver.cpp#L2863 [3] https://github.com/llvm/llvm-project/blob/ef7b18a53c0d186dcda1e322be6035407fdedb55/lld/ELF/Driver.cpp#L2995
Diffstat (limited to 'llvm/lib/LTO/LTO.cpp')
-rw-r--r--llvm/lib/LTO/LTO.cpp31
1 files changed, 27 insertions, 4 deletions
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 6807256..5d9a5cb 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -77,6 +77,10 @@ cl::opt<bool> EnableLTOInternalization(
"enable-lto-internalization", cl::init(true), cl::Hidden,
cl::desc("Enable global value internalization in LTO"));
+static cl::opt<bool>
+ LTOKeepSymbolCopies("lto-keep-symbol-copies", cl::init(false), cl::Hidden,
+ cl::desc("Keep copies of symbols in LTO indexing"));
+
/// Indicate we are linking with an allocator that supports hot/cold operator
/// new interfaces.
extern cl::opt<bool> SupportsHotColdNew;
@@ -587,8 +591,14 @@ LTO::LTO(Config Conf, ThinBackend Backend,
: Conf(std::move(Conf)),
RegularLTO(ParallelCodeGenParallelismLevel, this->Conf),
ThinLTO(std::move(Backend)),
- GlobalResolutions(std::make_optional<StringMap<GlobalResolution>>()),
- LTOMode(LTOMode) {}
+ GlobalResolutions(
+ std::make_unique<DenseMap<StringRef, GlobalResolution>>()),
+ LTOMode(LTOMode) {
+ if (Conf.KeepSymbolNameCopies || LTOKeepSymbolCopies) {
+ Alloc = std::make_unique<BumpPtrAllocator>();
+ GlobalResolutionSymbolSaver = std::make_unique<llvm::StringSaver>(*Alloc);
+ }
+}
// Requires a destructor for MapVector<BitcodeModule>.
LTO::~LTO() = default;
@@ -606,7 +616,12 @@ void LTO::addModuleToGlobalRes(ArrayRef<InputFile::Symbol> Syms,
assert(ResI != ResE);
SymbolResolution Res = *ResI++;
- auto &GlobalRes = (*GlobalResolutions)[Sym.getName()];
+ StringRef SymbolName = Sym.getName();
+ // Keep copies of symbols if the client of LTO says so.
+ if (GlobalResolutionSymbolSaver && !GlobalResolutions->contains(SymbolName))
+ SymbolName = GlobalResolutionSymbolSaver->save(SymbolName);
+
+ auto &GlobalRes = (*GlobalResolutions)[SymbolName];
GlobalRes.UnnamedAddr &= Sym.isUnnamedAddr();
if (Res.Prevailing) {
assert(!GlobalRes.Prevailing &&
@@ -660,6 +675,14 @@ void LTO::addModuleToGlobalRes(ArrayRef<InputFile::Symbol> Syms,
}
}
+void LTO::releaseGlobalResolutionsMemory() {
+ // Release GlobalResolutions dense-map itself.
+ GlobalResolutions.reset();
+ // Release the string saver memory.
+ GlobalResolutionSymbolSaver.reset();
+ Alloc.reset();
+}
+
static void writeToResolutionFile(raw_ostream &OS, InputFile *Input,
ArrayRef<SymbolResolution> Res) {
StringRef Path = Input->getName();
@@ -1771,7 +1794,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
// are no further accesses. We specifically want to do this before computing
// cross module importing, which adds to peak memory via the computed import
// and export lists.
- GlobalResolutions.reset();
+ releaseGlobalResolutionsMemory();
if (Conf.OptLevel > 0)
ComputeCrossModuleImport(ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,