12 files changed, 275 insertions, 208 deletions
diff --git a/lld/MachO/Arch/X86_64.cpp b/lld/MachO/Arch/X86_64.cpp
index 111c4d9..f351399 100644
--- a/lld/MachO/Arch/X86_64.cpp
+++ b/lld/MachO/Arch/X86_64.cpp
@@ -54,7 +54,8 @@ static constexpr std::array<RelocAttrs, 10> relocAttrsArray{{
     {"UNSIGNED", B(UNSIGNED) | B(ABSOLUTE) | B(EXTERN) | B(LOCAL) | B(BYTE1) |
                      B(BYTE4) | B(BYTE8)},
     {"SIGNED", B(PCREL) | B(EXTERN) | B(LOCAL) | B(BYTE4)},
-    {"BRANCH", B(PCREL) | B(EXTERN) | B(BRANCH) | B(BYTE1) | B(BYTE4)},
+    {"BRANCH",
+     B(PCREL) | B(EXTERN) | B(LOCAL) | B(BRANCH) | B(BYTE1) | B(BYTE4)},
     {"GOT_LOAD", B(PCREL) | B(EXTERN) | B(GOT) | B(LOAD) | B(BYTE4)},
     {"GOT", B(PCREL) | B(EXTERN) | B(GOT) | B(POINTER) | B(BYTE4)},
     {"SUBTRACTOR", B(SUBTRAHEND) | B(EXTERN) | B(BYTE4) | B(BYTE8)},
diff --git a/lld/MachO/ConcatOutputSection.cpp b/lld/MachO/ConcatOutputSection.cpp
index 8067d23..e559676 100644
--- a/lld/MachO/ConcatOutputSection.cpp
+++ b/lld/MachO/ConcatOutputSection.cpp
@@ -306,7 +306,7 @@ void TextOutputSection::finalize() {
     // contains several branch instructions in succession, then the distance
     // from the current position to the position where the thunks are inserted
     // grows. So leave room for a bunch of thunks.
-    unsigned slop = 256 * thunkSize;
+    unsigned slop = config->slopScale * thunkSize;
     while (finalIdx < endIdx) {
       uint64_t expectedNewSize =
           alignToPowerOf2(addr + size, inputs[finalIdx]->align) +
@@ -384,7 +384,9 @@ void TextOutputSection::finalize() {
         // above. If you hit this: For the current algorithm, just bumping up
         // slop above and trying again is probably simplest. (See also PR51578
         // comment 5).
-        fatal(Twine(__FUNCTION__) + ": FIXME: thunk range overrun");
+        fatal(Twine(__FUNCTION__) +
+              ": FIXME: thunk range overrun. Consider increasing the "
+              "slop-scale with `--slop-scale=<unsigned_int>`.");
       }
       thunkInfo.isec =
           makeSyntheticInputSection(isec->getSegName(), isec->getName());
diff --git a/lld/MachO/Config.h b/lld/MachO/Config.h
index a2ca577..759a8cb 100644
--- a/lld/MachO/Config.h
+++ b/lld/MachO/Config.h
@@ -224,6 +224,7 @@ struct Configuration {
   bool disableVerify;
   bool separateCstringLiteralSections;
   bool tailMergeStrings;
+  unsigned slopScale = 256;
 
   bool callGraphProfileSort = false;
   llvm::StringRef printSymbolOrder;
diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index 32b2099..f4f3aba 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -41,6 +41,7 @@
 #include "llvm/Object/Archive.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Parallel.h"
 #include "llvm/Support/Path.h"
@@ -53,6 +54,10 @@
 #include "llvm/TextAPI/Architecture.h"
 #include "llvm/TextAPI/PackedVersion.h"
 
+#if !_WIN32
+#include <sys/mman.h>
+#endif
+
 using namespace llvm;
 using namespace llvm::MachO;
 using namespace llvm::object;
@@ -292,12 +297,13 @@ struct DeferredFile {
 using DeferredFiles = std::vector<DeferredFile>;
 
 #if LLVM_ENABLE_THREADS
-class SerialBackgroundQueue {
+class SerialBackgroundWorkQueue {
   std::deque<std::function<void()>> queue;
   std::thread *running;
   std::mutex mutex;
 
 public:
+  std::atomic_bool stopAllWork = false;
   void queueWork(std::function<void()> work) {
     mutex.lock();
     if (running && queue.empty()) {
@@ -312,7 +318,7 @@ public:
       queue.emplace_back(std::move(work));
       if (!running)
         running = new std::thread([&]() {
-          while (true) {
+          while (!stopAllWork) {
             mutex.lock();
             if (queue.empty()) {
               mutex.unlock();
@@ -331,6 +337,8 @@ public:
   }
 };
 
+static SerialBackgroundWorkQueue pageInQueue;
+
 // Most input files have been mapped but not yet paged in.
 // This code forces the page-ins on multiple threads so
 // the process is not stalled waiting on disk buffer i/o.
@@ -339,8 +347,8 @@ void multiThreadedPageInBackground(DeferredFiles &deferred) {
   static const size_t largeArchive = 10 * 1024 * 1024;
 #ifndef NDEBUG
   using namespace std::chrono;
-  std::atomic_int numDeferedFilesTouched = 0;
   static std::atomic_uint64_t totalBytes = 0;
+  std::atomic_int numDeferedFilesAdvised = 0;
   auto t0 = high_resolution_clock::now();
 #endif
 
@@ -348,24 +356,34 @@ void multiThreadedPageInBackground(DeferredFiles &deferred) {
     const StringRef &buff = deferredFile.buffer.getBuffer();
     if (buff.size() > largeArchive)
       return;
+
 #ifndef NDEBUG
     totalBytes += buff.size();
-    numDeferedFilesTouched += 1;
+    numDeferedFilesAdvised += 1;
 #endif
-
+#if _WIN32
     // Reference all file's mmap'd pages to load them into memory.
-    for (const char *page = buff.data(), *end = page + buff.size(); page < end;
-         page += pageSize) {
+    for (const char *page = buff.data(), *end = page + buff.size();
+         page < end && !pageInQueue.stopAllWork; page += pageSize) {
       [[maybe_unused]] volatile char t = *page;
       (void)t;
     }
+#else
+#define DEBUG_TYPE "lld-madvise"
+    auto aligned =
+        llvm::alignDown(reinterpret_cast<uintptr_t>(buff.data()), pageSize);
+    if (madvise((void *)aligned, buff.size(), MADV_WILLNEED) < 0)
+      LLVM_DEBUG(llvm::dbgs() << "madvise error: " << strerror(errno) << "\n");
+#undef DEBUG_TYPE
+#endif
   };
+
   { // Create scope for waiting for the taskGroup
     std::atomic_size_t index = 0;
     llvm::parallel::TaskGroup taskGroup;
     for (int w = 0; w < config->readWorkers; w++)
       taskGroup.spawn([&index, &preloadDeferredFile, &deferred]() {
-        while (true) {
+        while (!pageInQueue.stopAllWork) {
           size_t localIndex = index.fetch_add(1);
           if (localIndex >= deferred.size())
             break;
@@ -373,17 +391,17 @@ void multiThreadedPageInBackground(DeferredFiles &deferred) {
         }
       });
   }
+
 #ifndef NDEBUG
   auto dt = high_resolution_clock::now() - t0;
   if (Process::GetEnv("LLD_MULTI_THREAD_PAGE"))
     llvm::dbgs() << "multiThreadedPageIn " << totalBytes << "/"
-                 << numDeferedFilesTouched << "/" << deferred.size() << "/"
+                 << numDeferedFilesAdvised << "/" << deferred.size() << "/"
                  << duration_cast<milliseconds>(dt).count() / 1000. << "\n";
 #endif
 }
 
 static void multiThreadedPageIn(const DeferredFiles &deferred) {
-  static SerialBackgroundQueue pageInQueue;
   pageInQueue.queueWork([=]() {
     DeferredFiles files = deferred;
     multiThreadedPageInBackground(files);
@@ -489,7 +507,7 @@ static InputFile *processFile(std::optional<MemoryBufferRef> buffer,
             continue;
           }
 
-          if (archiveContents)
+          if (config->readWorkers && archiveContents)
             archiveContents->push_back({path, isLazy, *mb});
           if (!hasObjCSection(*mb))
             continue;
@@ -1447,6 +1465,8 @@ static void createFiles(const InputArgList &args) {
       multiThreadedPageIn(archiveContents);
     for (auto *archive : archives)
       archive->addLazySymbols();
+
+    pageInQueue.stopAllWork = true;
   }
 #endif
 }
@@ -1845,8 +1865,8 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
             "'");
     config->readWorkers = workers;
 #else
-    error(arg->getSpelling() +
-          ": option unavailable because lld was not built with thread support");
+    warn(arg->getSpelling() +
+         ": option unavailable because lld was not built with thread support");
 #endif
   }
   if (auto *arg = args.getLastArg(OPT_threads_eq)) {
@@ -1995,6 +2015,14 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
                    OPT_no_separate_cstring_literal_sections, false);
   config->tailMergeStrings =
       args.hasFlag(OPT_tail_merge_strings, OPT_no_tail_merge_strings, false);
+  if (auto *arg = args.getLastArg(OPT_slop_scale_eq)) {
+    StringRef v(arg->getValue());
+    unsigned slop = 0;
+    if (!llvm::to_integer(v, slop))
+      error(arg->getSpelling() +
+            ": expected a non-negative integer, but got '" + v + "'");
+    config->slopScale = slop;
+  }
 
   auto IncompatWithCGSort = [&](StringRef firstArgStr) {
     // Throw an error only if --call-graph-profile-sort is explicitly specified
diff --git a/lld/MachO/ICF.cpp b/lld/MachO/ICF.cpp
index 7b31378c3..e0fc897 100644
--- a/lld/MachO/ICF.cpp
+++ b/lld/MachO/ICF.cpp
@@ -173,14 +173,37 @@ bool ICF::equalsConstant(const ConcatInputSection *ia,
       // a valid offset in the literal section.
       return isecA->getOffset(valueA) == isecB->getOffset(valueB) &&
              ra.addend == rb.addend;
-    else {
-      assert(valueA == 0 && valueB == 0);
-      // For section relocs, we compare the content at the section offset.
-      return isecA->getOffset(ra.addend) == isecB->getOffset(rb.addend);
-    }
+    assert(valueA == 0 && valueB == 0);
+    // For section relocs, we compare the content at the section offset.
+    return isecA->getOffset(ra.addend) == isecB->getOffset(rb.addend);
   };
-  return std::equal(ia->relocs.begin(), ia->relocs.end(), ib->relocs.begin(),
-                    f);
+  if (!llvm::equal(ia->relocs, ib->relocs, f))
+    return false;
+
+  // Check unwind info structural compatibility: if there are symbols with
+  // associated unwind info, check that both sections have compatible symbol
+  // layouts. For simplicity, we only attempt folding when all symbols are at
+  // offset zero within the section (which is typically the case with
+  // .subsections_via_symbols.)
+  auto hasUnwind = [](Defined *d) { return d->unwindEntry() != nullptr; };
+  const auto *itA = llvm::find_if(ia->symbols, hasUnwind);
+  const auto *itB = llvm::find_if(ib->symbols, hasUnwind);
+  if (itA == ia->symbols.end())
+    return itB == ib->symbols.end();
+  if (itB == ib->symbols.end())
+    return false;
+  const Defined *da = *itA;
+  const Defined *db = *itB;
+  if (da->value != 0 || db->value != 0)
+    return false;
+  auto isZero = [](Defined *d) { return d->value == 0; };
+  // Since symbols are stored in order of value, and since we have already
+  // checked that da/db have value zero, we just need to do the isZero check on
+  // the subsequent symbols.
+  return std::find_if_not(std::next(itA), ia->symbols.end(), isZero) ==
+             ia->symbols.end() &&
+         std::find_if_not(std::next(itB), ib->symbols.end(), isZero) ==
+             ib->symbols.end();
 }
 
 // Compare the "moving" parts of two ConcatInputSections -- i.e. everything not
@@ -217,31 +240,19 @@ bool ICF::equalsVariable(const ConcatInputSection *ia,
     }
     return isecA->icfEqClass[icfPass % 2] == isecB->icfEqClass[icfPass % 2];
   };
-  if (!std::equal(ia->relocs.begin(), ia->relocs.end(), ib->relocs.begin(), f))
+  if (!llvm::equal(ia->relocs, ib->relocs, f))
     return false;
 
-  // If there are symbols with associated unwind info, check that the unwind
-  // info matches. For simplicity, we only handle the case where there are only
-  // symbols at offset zero within the section (which is typically the case with
-  // .subsections_via_symbols.)
+  // Compare unwind info equivalence classes.
   auto hasUnwind = [](Defined *d) { return d->unwindEntry() != nullptr; };
   const auto *itA = llvm::find_if(ia->symbols, hasUnwind);
-  const auto *itB = llvm::find_if(ib->symbols, hasUnwind);
   if (itA == ia->symbols.end())
-    return itB == ib->symbols.end();
-  if (itB == ib->symbols.end())
-    return false;
+    return true;
   const Defined *da = *itA;
-  const Defined *db = *itB;
-  if (da->unwindEntry()->icfEqClass[icfPass % 2] !=
-          db->unwindEntry()->icfEqClass[icfPass % 2] ||
-      da->value != 0 || db->value != 0)
-    return false;
-  auto isZero = [](Defined *d) { return d->value == 0; };
-  return std::find_if_not(std::next(itA), ia->symbols.end(), isZero) ==
-             ia->symbols.end() &&
-         std::find_if_not(std::next(itB), ib->symbols.end(), isZero) ==
-             ib->symbols.end();
+  // equalsConstant() guarantees that both sections have unwind info.
+  const Defined *db = *llvm::find_if(ib->symbols, hasUnwind);
+  return da->unwindEntry()->icfEqClass[icfPass % 2] ==
+         db->unwindEntry()->icfEqClass[icfPass % 2];
 }
 
 // Find the first InputSection after BEGIN whose equivalence class differs
diff --git a/lld/MachO/InputFiles.cpp b/lld/MachO/InputFiles.cpp
index 20e4a1d..81caef5 100644
--- a/lld/MachO/InputFiles.cpp
+++ b/lld/MachO/InputFiles.cpp
@@ -217,7 +217,8 @@ std::optional<MemoryBufferRef> macho::readFile(StringRef path) {
   if (entry != cachedReads.end())
     return entry->second;
 
-  ErrorOr<std::unique_ptr<MemoryBuffer>> mbOrErr = MemoryBuffer::getFile(path);
+  ErrorOr<std::unique_ptr<MemoryBuffer>> mbOrErr =
+      MemoryBuffer::getFile(path, false, /*RequiresNullTerminator=*/false);
   if (std::error_code ec = mbOrErr.getError()) {
     error("cannot open " + path + ": " + ec.message());
     return std::nullopt;
@@ -594,8 +595,8 @@ void ObjFile::parseRelocations(ArrayRef<SectionHeader> sectionHeaders,
         // FIXME This logic was written around x86_64 behavior -- ARM64 doesn't
         // have pcrel section relocations. We may want to factor this out into
         // the arch-specific .cpp file.
-        assert(target->hasAttr(r.type, RelocAttrBits::BYTE4));
-        referentOffset = sec.addr + relInfo.r_address + 4 + totalAddend -
+        referentOffset = sec.addr + relInfo.r_address +
+                         (1ull << relInfo.r_length) + totalAddend -
                          referentSecHead.addr;
       } else {
         // The addend for a non-pcrel relocation is its absolute address.
@@ -808,6 +809,17 @@ void ObjFile::parseSymbols(ArrayRef<typename LP::section> sectionHeaders,
       continue;
 
     if ((sym.n_type & N_TYPE) == N_SECT) {
+      if (sym.n_sect == 0) {
+        fatal("section symbol " + StringRef(strtab + sym.n_strx) + " in " +
+              toString(this) + " has an invalid section index [0]");
+      }
+      if (sym.n_sect > sections.size()) {
+        fatal("section symbol " + StringRef(strtab + sym.n_strx) + " in " +
+              toString(this) + " has an invalid section index [" +
+              Twine(static_cast<unsigned>(sym.n_sect)) +
+              "] greater than the total number of sections [" +
+              Twine(sections.size()) + "]");
+      }
       Subsections &subsections = sections[sym.n_sect - 1]->subsections;
       // parseSections() may have chosen not to parse this section.
       if (subsections.empty())
diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td
index be1a1cc..f5f7549 100644
--- a/lld/MachO/Options.td
+++ b/lld/MachO/Options.td
@@ -1095,6 +1095,14 @@ defm tail_merge_strings
     : BB<"tail-merge-strings", "Enable string tail merging",
          "Disable string tail merging to improve link-time performance">,
       Group<grp_rare>;
+def slop_scale_eq
+    : Joined<["--"], "slop_scale=">,
+      MetaVarName<"<unsigned_int>">,
+      HelpText<"Specify the slop scale. Default value is 256. If your binary "
+               "has too many consecutive branch instructions resulting in "
+               "thunk-range overrun, then you need to increase this value to a "
+               "higher value, such as 512 or 1024, etc">,
+      Group<grp_rare>;
 
 def grp_deprecated : OptionGroup<"deprecated">, HelpText<"DEPRECATED">;
 
diff --git a/lld/MachO/SectionPriorities.cpp b/lld/MachO/SectionPriorities.cpp
index cf657aa..b652d1e 100644
--- a/lld/MachO/SectionPriorities.cpp
+++ b/lld/MachO/SectionPriorities.cpp
@@ -27,6 +27,7 @@
 #include "llvm/Support/Path.h"
 #include "llvm/Support/TimeProfiler.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/xxhash.h"
 
 #include <numeric>
 
@@ -246,33 +247,45 @@ DenseMap<const InputSection *, int> CallGraphSort::run() {
   return orderMap;
 }
 
-std::optional<int>
-macho::PriorityBuilder::getSymbolOrCStringPriority(const StringRef key,
-                                                   InputFile *f) {
+void macho::PriorityBuilder::SymbolPriorityEntry::setPriority(
+    int priority, StringRef objectFile) {
+  if (!objectFile.empty())
+    objectFiles.try_emplace(objectFile, priority);
+  else
+    anyObjectFile = std::min(anyObjectFile, priority);
+}
 
-  auto it = priorities.find(key);
-  if (it == priorities.end())
-    return std::nullopt;
-  const SymbolPriorityEntry &entry = it->second;
+int macho::PriorityBuilder::SymbolPriorityEntry::getPriority(
+    const InputFile *f) const {
   if (!f)
-    return entry.anyObjectFile;
+    return anyObjectFile;
   // We don't use toString(InputFile *) here because it returns the full path
   // for object files, and we only want the basename.
-  StringRef filename;
-  if (f->archiveName.empty())
-    filename = path::filename(f->getName());
-  else
-    filename = saver().save(path::filename(f->archiveName) + "(" +
-                            path::filename(f->getName()) + ")");
-  return std::min(entry.objectFiles.lookup(filename), entry.anyObjectFile);
+  StringRef basename = path::filename(f->getName());
+  StringRef filename =
+      f->archiveName.empty()
+          ? basename
+          : saver().save(path::filename(f->archiveName) + "(" + basename + ")");
+  return std::min(objectFiles.lookup(filename), anyObjectFile);
 }
 
 std::optional<int>
-macho::PriorityBuilder::getSymbolPriority(const Defined *sym) {
+macho::PriorityBuilder::getCStringPriority(uint32_t hash,
+                                           const InputFile *f) const {
+  auto it = cStringPriorities.find(hash);
+  if (it == cStringPriorities.end())
+    return std::nullopt;
+  return it->second.getPriority(f);
+}
+
+std::optional<int>
+macho::PriorityBuilder::getSymbolPriority(const Defined *sym) const {
   if (sym->isAbsolute())
     return std::nullopt;
-  return getSymbolOrCStringPriority(utils::getRootSymbol(sym->getName()),
-                                    sym->isec()->getFile());
+  auto it = priorities.find(utils::getRootSymbol(sym->getName()));
+  if (it == priorities.end())
+    return std::nullopt;
+  return it->second.getPriority(sym->isec()->getFile());
 }
 
 void macho::PriorityBuilder::extractCallGraphProfile() {
@@ -307,7 +320,7 @@ void macho::PriorityBuilder::parseOrderFile(StringRef path) {
   int prio = std::numeric_limits<int>::min();
   MemoryBufferRef mbref = *buffer;
   for (StringRef line : args::getLines(mbref)) {
-    StringRef objectFile, symbolOrCStrHash;
+    StringRef objectFile;
     line = line.take_until([](char c) { return c == '#'; }); // ignore comments
     line = line.ltrim();
 
@@ -338,22 +351,16 @@ void macho::PriorityBuilder::parseOrderFile(StringRef path) {
     }
 
     // The rest of the line is either <symbol name> or
-    // CStringEntryPrefix<cstring hash>
+    // cStringEntryPrefix<cstring hash>
     line = line.trim();
-    if (line.starts_with(CStringEntryPrefix)) {
-      StringRef possibleHash = line.drop_front(CStringEntryPrefix.size());
+    if (line.consume_front(cStringEntryPrefix)) {
       uint32_t hash = 0;
-      if (to_integer(possibleHash, hash))
-        symbolOrCStrHash = possibleHash;
-    } else
-      symbolOrCStrHash = utils::getRootSymbol(line);
-
-    if (!symbolOrCStrHash.empty()) {
-      SymbolPriorityEntry &entry = priorities[symbolOrCStrHash];
-      if (!objectFile.empty())
-        entry.objectFiles.insert(std::make_pair(objectFile, prio));
-      else
-        entry.anyObjectFile = std::min(entry.anyObjectFile, prio);
+      if (to_integer(line, hash))
+        cStringPriorities[hash].setPriority(prio, objectFile);
+    } else {
+      StringRef symbol = utils::getRootSymbol(line);
+      if (!symbol.empty())
+        priorities[symbol].setPriority(prio, objectFile);
     }
 
     ++prio;
@@ -405,40 +412,39 @@ macho::PriorityBuilder::buildInputSectionPriorities() {
   return sectionPriorities;
 }
 
-std::vector<StringPiecePair> macho::PriorityBuilder::buildCStringPriorities(
-    ArrayRef<CStringInputSection *> inputs) {
-  // Split the input strings into hold and cold sets.
-  // Order hot set based on -order_file_cstring for performance improvement;
-  // TODO: Order cold set of cstrings for compression via BP.
-  std::vector<std::pair<int, StringPiecePair>>
-      hotStringPrioritiesAndStringPieces;
-  std::vector<StringPiecePair> coldStringPieces;
-  std::vector<StringPiecePair> orderedStringPieces;
-
+void macho::PriorityBuilder::forEachStringPiece(
+    ArrayRef<CStringInputSection *> inputs,
+    std::function<void(CStringInputSection &, StringPiece &, size_t)> f,
+    bool forceInputOrder, bool computeHash) const {
+  std::vector<std::tuple<int, CStringInputSection *, size_t>> orderedPieces;
+  std::vector<std::pair<CStringInputSection *, size_t>> unorderedPieces;
   for (CStringInputSection *isec : inputs) {
     for (const auto &[stringPieceIdx, piece] : llvm::enumerate(isec->pieces)) {
       if (!piece.live)
         continue;
-
-      std::optional<int> priority = getSymbolOrCStringPriority(
-          std::to_string(piece.hash), isec->getFile());
-      if (!priority)
-        coldStringPieces.emplace_back(isec, stringPieceIdx);
+      // Process pieces in input order if we have no cstrings in our orderfile
+      if (forceInputOrder || cStringPriorities.empty()) {
+        f(*isec, piece, stringPieceIdx);
+        continue;
+      }
+      uint32_t hash =
+          computeHash
+              ? (xxh3_64bits(isec->getStringRef(stringPieceIdx)) & 0x7fffffff)
+              : piece.hash;
+      if (auto priority = getCStringPriority(hash, isec->getFile()))
+        orderedPieces.emplace_back(*priority, isec, stringPieceIdx);
       else
-        hotStringPrioritiesAndStringPieces.emplace_back(
-            *priority, std::make_pair(isec, stringPieceIdx));
+        unorderedPieces.emplace_back(isec, stringPieceIdx);
     }
   }
-
-  // Order hot set for perf
-  llvm::stable_sort(hotStringPrioritiesAndStringPieces);
-  for (auto &[priority, stringPiecePair] : hotStringPrioritiesAndStringPieces)
-    orderedStringPieces.push_back(stringPiecePair);
-
-  // TODO: Order cold set for compression
-
-  orderedStringPieces.insert(orderedStringPieces.end(),
-                             coldStringPieces.begin(), coldStringPieces.end());
-
-  return orderedStringPieces;
+  if (orderedPieces.empty() && unorderedPieces.empty())
+    return;
+  llvm::stable_sort(orderedPieces, [](const auto &left, const auto &right) {
+    return std::get<0>(left) < std::get<0>(right);
+  });
+  for (auto &[priority, isec, pieceIdx] : orderedPieces)
+    f(*isec, isec->pieces[pieceIdx], pieceIdx);
+  // TODO: Add option to order the remaining cstrings for compression
+  for (auto &[isec, pieceIdx] : unorderedPieces)
+    f(*isec, isec->pieces[pieceIdx], pieceIdx);
 }
diff --git a/lld/MachO/SectionPriorities.h b/lld/MachO/SectionPriorities.h
index cc4e30f..24d2dbc 100644
--- a/lld/MachO/SectionPriorities.h
+++ b/lld/MachO/SectionPriorities.h
@@ -16,7 +16,6 @@
 namespace lld::macho {
 
 using SectionPair = std::pair<const InputSection *, const InputSection *>;
-using StringPiecePair = std::pair<CStringInputSection *, size_t>;
 
 class PriorityBuilder {
 public:
@@ -29,7 +28,7 @@ public:
   //
   // An order file has one entry per line, in the following format:
   //
-  //   <cpu>:<object file>:[<symbol name> | CStringEntryPrefix <cstring hash>]
+  //   <cpu>:<object file>:[<symbol name> | cStringEntryPrefix <cstring hash>]
   //
   // <cpu> and <object file> are optional.
   // If not specified, then that entry tries to match either,
@@ -42,7 +41,7 @@ public:
   // lowest-ordered entry (the one nearest to the front of the list.)
   //
   // or 2) any cstring literal with the given hash, if the entry has the
-  // CStringEntryPrefix prefix defined below in the file. <cstring hash> is the
+  // cStringEntryPrefix prefix defined below in the file. <cstring hash> is the
   // hash of cstring literal content.
   //
   // Cstring literals are not symbolized, we can't identify them by name
@@ -54,6 +53,16 @@ public:
   // The file can also have line comments that start with '#'.
   void parseOrderFile(StringRef path);
 
+  /// Call \p f for each string piece in \p inputs. If there are any cstring
+  /// literals in the orderfile (and \p forceInputOrder is false) then string
+  /// pieces are ordered by the orderfile. \p computeHash must be set when
+  /// \p deduplicateLiterals is false because then the string piece hash is not
+  /// set.
+  void forEachStringPiece(
+      ArrayRef<CStringInputSection *> inputs,
+      std::function<void(CStringInputSection &, StringPiece &, size_t)> f,
+      bool forceInputOrder = false, bool computeHash = false) const;
+
   // Returns layout priorities for some or all input sections. Sections are laid
   // out in decreasing order; that is, a higher priority section will be closer
   // to the beginning of its output section.
@@ -66,8 +75,6 @@ public:
   // Each section gets assigned the priority of the highest-priority symbol it
   // contains.
   llvm::DenseMap<const InputSection *, int> buildInputSectionPriorities();
-  std::vector<StringPiecePair>
-      buildCStringPriorities(ArrayRef<CStringInputSection *>);
 
 private:
   // The symbol with the smallest priority should be ordered first in the output
@@ -78,13 +85,16 @@ private:
     int anyObjectFile = 0;
     // The priority given to a matching symbol from a particular object file.
     llvm::DenseMap<llvm::StringRef, int> objectFiles;
+    void setPriority(int priority, StringRef objectFile);
+    int getPriority(const InputFile *f) const;
   };
-  const llvm::StringRef CStringEntryPrefix = "CSTR;";
+  const llvm::StringRef cStringEntryPrefix = "CSTR;";
 
-  std::optional<int> getSymbolPriority(const Defined *sym);
-  std::optional<int> getSymbolOrCStringPriority(const StringRef key,
-                                                InputFile *f);
+  std::optional<int> getSymbolPriority(const Defined *sym) const;
+  std::optional<int> getCStringPriority(uint32_t hash,
+                                        const InputFile *f) const;
   llvm::DenseMap<llvm::StringRef, SymbolPriorityEntry> priorities;
+  llvm::DenseMap<uint32_t, SymbolPriorityEntry> cStringPriorities;
   llvm::MapVector<SectionPair, uint64_t> callGraphProfile;
 };
 
diff --git a/lld/MachO/SymbolTable.cpp b/lld/MachO/SymbolTable.cpp
index baddddc..a7db5a3a 100644
--- a/lld/MachO/SymbolTable.cpp
+++ b/lld/MachO/SymbolTable.cpp
@@ -61,8 +61,8 @@ struct DuplicateSymbolDiag {
 SmallVector<DuplicateSymbolDiag> dupSymDiags;
 } // namespace
 
-// Move symbols at \p fromOff in \p fromIsec into \p toIsec, unless that symbol
-// is \p skip.
+// Move local symbols at \p fromOff in \p fromIsec into \p toIsec, unless that
+// symbol is \p skip, in which case we just remove it.
 static void transplantSymbolsAtOffset(InputSection *fromIsec,
                                       InputSection *toIsec, Defined *skip,
                                       uint64_t fromOff, uint64_t toOff) {
@@ -78,22 +78,23 @@ static void transplantSymbolsAtOffset(InputSection *fromIsec,
   auto insertIt = llvm::upper_bound(toIsec->symbols, toOff, symSucceedsOff);
   llvm::erase_if(fromIsec->symbols, [&](Symbol *s) {
     auto *d = cast<Defined>(s);
-    if (d->value != fromOff)
+    if (d == skip)
+      return true;
+    if (d->value != fromOff || d->isExternal())
       return false;
-    if (d != skip) {
-      // This repeated insertion will be quadratic unless insertIt is the end
-      // iterator. However, that is typically the case for files that have
-      // .subsections_via_symbols set.
-      insertIt = toIsec->symbols.insert(insertIt, d);
-      d->originalIsec = toIsec;
-      d->value = toOff;
-      // We don't want to have more than one unwindEntry at a given address, so
-      // drop the redundant ones. We We can safely drop the unwindEntries of
-      // the symbols in fromIsec since we will be adding another unwindEntry as
-      // we finish parsing toIsec's file. (We can assume that toIsec has its
-      // own unwindEntry because of the ODR.)
-      d->originalUnwindEntry = nullptr;
-    }
+
+    // This repeated insertion will be quadratic unless insertIt is the end
+    // iterator. However, that is typically the case for files that have
+    // .subsections_via_symbols set.
+    insertIt = toIsec->symbols.insert(insertIt, d);
+    d->originalIsec = toIsec;
+    d->value = toOff;
+    // We don't want to have more than one unwindEntry at a given address, so
+    // drop the redundant ones. We can safely drop the unwindEntries of the
+    // symbols in fromIsec since we will be adding another unwindEntry as we
+    // finish parsing toIsec's file. (We can assume that toIsec has its own
+    // unwindEntry because of the ODR.)
+    d->originalUnwindEntry = nullptr;
     return true;
   });
 }
diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp
index 187cccb..fecc51f 100644
--- a/lld/MachO/SyntheticSections.cpp
+++ b/lld/MachO/SyntheticSections.cpp
@@ -1721,26 +1721,24 @@ void CStringSection::writeTo(uint8_t *buf) const {
 // and don't need this alignment. They will be emitted at some arbitrary address
 // `A`, but ld64 will treat them as being 16-byte aligned with an offset of
 // `16 % A`.
-static Align getStringPieceAlignment(const CStringInputSection *isec,
+static Align getStringPieceAlignment(const CStringInputSection &isec,
                                      const StringPiece &piece) {
-  return llvm::Align(1ULL << llvm::countr_zero(isec->align | piece.inSecOff));
+  return llvm::Align(1ULL << llvm::countr_zero(isec.align | piece.inSecOff));
 }
 
 void CStringSection::finalizeContents() {
   size = 0;
-  // TODO: Call buildCStringPriorities() to support cstring ordering when
-  // deduplication is off, although this may negatively impact build
-  // performance.
-  for (CStringInputSection *isec : inputs) {
-    for (const auto &[i, piece] : llvm::enumerate(isec->pieces)) {
-      if (!piece.live)
-        continue;
-      piece.outSecOff = alignTo(size, getStringPieceAlignment(isec, piece));
-      StringRef string = isec->getStringRef(i);
-      size = piece.outSecOff + string.size() + 1; // account for null terminator
-    }
+  priorityBuilder.forEachStringPiece(
+      inputs,
+      [&](CStringInputSection &isec, StringPiece &piece, size_t pieceIdx) {
+        piece.outSecOff = alignTo(size, getStringPieceAlignment(isec, piece));
+        StringRef string = isec.getStringRef(pieceIdx);
+        size =
+            piece.outSecOff + string.size() + 1; // account for null terminator
+      },
+      /*forceInputOrder=*/false, /*computeHash=*/true);
+  for (CStringInputSection *isec : inputs)
     isec->isFinal = true;
-  }
 }
 
 void DeduplicatedCStringSection::finalizeContents() {
@@ -1748,20 +1746,19 @@ void DeduplicatedCStringSection::finalizeContents() {
   DenseMap<CachedHashStringRef, Align> strToAlignment;
   // Used for tail merging only
   std::vector<CachedHashStringRef> deduplicatedStrs;
-  for (const CStringInputSection *isec : inputs) {
-    for (const auto &[i, piece] : llvm::enumerate(isec->pieces)) {
-      if (!piece.live)
-        continue;
-      auto s = isec->getCachedHashStringRef(i);
-      assert(isec->align != 0);
-      auto align = getStringPieceAlignment(isec, piece);
-      auto [it, wasInserted] = strToAlignment.try_emplace(s, align);
-      if (config->tailMergeStrings && wasInserted)
-        deduplicatedStrs.push_back(s);
-      if (!wasInserted && it->second < align)
-        it->second = align;
-    }
-  }
+  priorityBuilder.forEachStringPiece(
+      inputs,
+      [&](CStringInputSection &isec, StringPiece &piece, size_t pieceIdx) {
+        auto s = isec.getCachedHashStringRef(pieceIdx);
+        assert(isec.align != 0);
+        auto align = getStringPieceAlignment(isec, piece);
+        auto [it, wasInserted] = strToAlignment.try_emplace(s, align);
+        if (config->tailMergeStrings && wasInserted)
+          deduplicatedStrs.push_back(s);
+        if (!wasInserted && it->second < align)
+          it->second = align;
+      },
+      /*forceInputOrder=*/true);
 
   // Like lexigraphical sort, except we read strings in reverse and take the
   // longest string first
@@ -1801,9 +1798,10 @@ void DeduplicatedCStringSection::finalizeContents() {
   // Sort the strings for performance and compression size win, and then
   // assign an offset for each string and save it to the corresponding
   // StringPieces for easy access.
-  for (auto &[isec, i] : priorityBuilder.buildCStringPriorities(inputs)) {
-    auto &piece = isec->pieces[i];
-    auto s = isec->getCachedHashStringRef(i);
+  priorityBuilder.forEachStringPiece(inputs, [&](CStringInputSection &isec,
+                                                 StringPiece &piece,
+                                                 size_t pieceIdx) {
+    auto s = isec.getCachedHashStringRef(pieceIdx);
     // Any string can be tail merged with itself with an offset of zero
     uint64_t tailMergeOffset = 0;
     auto mergeIt =
@@ -1829,7 +1827,7 @@ void DeduplicatedCStringSection::finalizeContents() {
       stringOffsetMap[tailMergedString] = piece.outSecOff;
       assert(isAligned(strToAlignment.at(tailMergedString), piece.outSecOff));
     }
-  }
+  });
   for (CStringInputSection *isec : inputs)
     isec->isFinal = true;
 }
diff --git a/lld/MachO/UnwindInfoSection.cpp b/lld/MachO/UnwindInfoSection.cpp
index 6e9f6c2..bf01b12 100644
--- a/lld/MachO/UnwindInfoSection.cpp
+++ b/lld/MachO/UnwindInfoSection.cpp
@@ -153,8 +153,6 @@ private:
   // The entries here will be in the same order as their originating symbols
   // in symbolsVec.
   std::vector<CompactUnwindEntry> cuEntries;
-  // Indices into the cuEntries vector.
-  std::vector<size_t> cuIndices;
   std::vector<Symbol *> personalities;
   SmallDenseMap<std::pair<InputSection *, uint64_t /* addend */>, Symbol *>
       personalityTable;
@@ -400,8 +398,7 @@ void UnwindInfoSectionImpl::relocateCompactUnwind(
 // There should only be a handful of unique personality pointers, so we can
 // encode them as 2-bit indices into a small array.
 void UnwindInfoSectionImpl::encodePersonalities() {
-  for (size_t idx : cuIndices) {
-    CompactUnwindEntry &cu = cuEntries[idx];
+  for (CompactUnwindEntry &cu : cuEntries) {
     if (cu.personality == nullptr)
       continue;
     // Linear search is fast enough for a small array.
@@ -467,27 +464,24 @@ void UnwindInfoSectionImpl::finalize() {
   symbolsVec = symbols.takeVector();
   relocateCompactUnwind(cuEntries);
 
-  // Rather than sort & fold the 32-byte entries directly, we create a
-  // vector of indices to entries and sort & fold that instead.
-  cuIndices.resize(cuEntries.size());
-  std::iota(cuIndices.begin(), cuIndices.end(), 0);
-  llvm::sort(cuIndices, [&](size_t a, size_t b) {
-    return cuEntries[a].functionAddress < cuEntries[b].functionAddress;
+  // Sort the entries by address.
+  llvm::sort(cuEntries, [&](auto &a, auto &b) {
+    return a.functionAddress < b.functionAddress;
   });
 
   // Record the ending boundary before we fold the entries.
-  cueEndBoundary = cuEntries[cuIndices.back()].functionAddress +
-                   cuEntries[cuIndices.back()].functionLength;
+  cueEndBoundary =
+      cuEntries.back().functionAddress + cuEntries.back().functionLength;
 
   // Fold adjacent entries with matching encoding+personality and without LSDA
-  // We use three iterators on the same cuIndices to fold in-situ:
+  // We use three iterators to fold in-situ:
   // (1) `foldBegin` is the first of a potential sequence of matching entries
   // (2) `foldEnd` is the first non-matching entry after `foldBegin`.
   // The semi-open interval [ foldBegin .. foldEnd ) contains a range
   // entries that can be folded into a single entry and written to ...
   // (3) `foldWrite`
-  auto foldWrite = cuIndices.begin();
-  for (auto foldBegin = cuIndices.begin(); foldBegin < cuIndices.end();) {
+  auto foldWrite = cuEntries.begin();
+  for (auto foldBegin = cuEntries.begin(); foldBegin != cuEntries.end();) {
     auto foldEnd = foldBegin;
     // Common LSDA encodings (e.g. for C++ and Objective-C) contain offsets from
     // a base address. The base address is normally not contained directly in
@@ -503,9 +497,9 @@ void UnwindInfoSectionImpl::finalize() {
     // directly in the LSDA, two functions at different addresses would
     // necessarily have different LSDAs, so their CU entries would not have been
     // folded anyway.
-    while (++foldEnd < cuIndices.end() &&
-           cuEntries[*foldBegin].encoding == cuEntries[*foldEnd].encoding &&
-           !cuEntries[*foldBegin].lsda && !cuEntries[*foldEnd].lsda &&
+    while (++foldEnd != cuEntries.end() &&
+           foldBegin->encoding == foldEnd->encoding && !foldBegin->lsda &&
+           !foldEnd->lsda &&
            // If we've gotten to this point, we don't have an LSDA, which should
            // also imply that we don't have a personality function, since in all
            // likelihood a personality function needs the LSDA to do anything
@@ -513,21 +507,20 @@ void UnwindInfoSectionImpl::finalize() {
            // and no LSDA though (e.g. the C++ personality __gxx_personality_v0
            // is just a no-op without LSDA), so we still check for personality
            // function equivalence to handle that case.
-           cuEntries[*foldBegin].personality ==
-               cuEntries[*foldEnd].personality &&
-           canFoldEncoding(cuEntries[*foldEnd].encoding))
+           foldBegin->personality == foldEnd->personality &&
+           canFoldEncoding(foldEnd->encoding))
       ;
     *foldWrite++ = *foldBegin;
     foldBegin = foldEnd;
   }
-  cuIndices.erase(foldWrite, cuIndices.end());
+  cuEntries.erase(foldWrite, cuEntries.end());
 
   encodePersonalities();
 
   // Count frequencies of the folded encodings
   EncodingMap encodingFrequencies;
-  for (size_t idx : cuIndices)
-    encodingFrequencies[cuEntries[idx].encoding]++;
+  for (const CompactUnwindEntry &cu : cuEntries)
+    encodingFrequencies[cu.encoding]++;
 
   // Make a vector of encodings, sorted by descending frequency
   for (const auto &frequency : encodingFrequencies)
@@ -558,21 +551,19 @@ void UnwindInfoSectionImpl::finalize() {
   //     and 127..255 references a local per-second-level-page table.
   // First we try the compact format and determine how many entries fit.
   // If more entries fit in the regular format, we use that.
-  for (size_t i = 0; i < cuIndices.size();) {
-    size_t idx = cuIndices[i];
+  for (size_t i = 0; i < cuEntries.size();) {
     secondLevelPages.emplace_back();
     SecondLevelPage &page = secondLevelPages.back();
     page.entryIndex = i;
     uint64_t functionAddressMax =
-        cuEntries[idx].functionAddress + COMPRESSED_ENTRY_FUNC_OFFSET_MASK;
+        cuEntries[i].functionAddress + COMPRESSED_ENTRY_FUNC_OFFSET_MASK;
     size_t n = commonEncodings.size();
     size_t wordsRemaining =
         SECOND_LEVEL_PAGE_WORDS -
         sizeof(unwind_info_compressed_second_level_page_header) /
             sizeof(uint32_t);
-    while (wordsRemaining >= 1 && i < cuIndices.size()) {
-      idx = cuIndices[i];
-      const CompactUnwindEntry *cuPtr = &cuEntries[idx];
+    while (wordsRemaining >= 1 && i < cuEntries.size()) {
+      const CompactUnwindEntry *cuPtr = &cuEntries[i];
       if (cuPtr->functionAddress >= functionAddressMax)
         break;
       if (commonEncodingIndexes.count(cuPtr->encoding) ||
@@ -593,21 +584,21 @@ void UnwindInfoSectionImpl::finalize() {
     // If this is not the final page, see if it's possible to fit more entries
     // by using the regular format. This can happen when there are many unique
     // encodings, and we saturated the local encoding table early.
-    if (i < cuIndices.size() &&
+    if (i < cuEntries.size() &&
         page.entryCount < REGULAR_SECOND_LEVEL_ENTRIES_MAX) {
       page.kind = UNWIND_SECOND_LEVEL_REGULAR;
       page.entryCount = std::min(REGULAR_SECOND_LEVEL_ENTRIES_MAX,
-                                 cuIndices.size() - page.entryIndex);
+                                 cuEntries.size() - page.entryIndex);
       i = page.entryIndex + page.entryCount;
     } else {
       page.kind = UNWIND_SECOND_LEVEL_COMPRESSED;
     }
   }
 
-  for (size_t idx : cuIndices) {
-    lsdaIndex[idx] = entriesWithLsda.size();
-    if (cuEntries[idx].lsda)
-      entriesWithLsda.push_back(idx);
+  for (size_t i = 0; i < cuEntries.size(); ++i) {
+    lsdaIndex[i] = entriesWithLsda.size();
+    if (cuEntries[i].lsda)
+      entriesWithLsda.push_back(i);
   }
 
   // compute size of __TEXT,__unwind_info section
@@ -626,7 +617,7 @@ void UnwindInfoSectionImpl::finalize() {
 // All inputs are relocated and output addresses are known, so write!
 
 void UnwindInfoSectionImpl::writeTo(uint8_t *buf) const {
-  assert(!cuIndices.empty() && "call only if there is unwind info");
+  assert(!cuEntries.empty() && "call only if there is unwind info");
 
   // section header
   auto *uip = reinterpret_cast<unwind_info_section_header *>(buf);
@@ -660,7 +651,7 @@ void UnwindInfoSectionImpl::writeTo(uint8_t *buf) const {
   uint64_t l2PagesOffset = level2PagesOffset;
   auto *iep = reinterpret_cast<unwind_info_section_header_index_entry *>(i32p);
   for (const SecondLevelPage &page : secondLevelPages) {
-    size_t idx = cuIndices[page.entryIndex];
+    size_t idx = page.entryIndex;
     iep->functionOffset = cuEntries[idx].functionAddress - in.header->addr;
     iep->secondLevelPagesSectionOffset = l2PagesOffset;
     iep->lsdaIndexArraySectionOffset =
@@ -695,7 +686,7 @@ void UnwindInfoSectionImpl::writeTo(uint8_t *buf) const {
   for (const SecondLevelPage &page : secondLevelPages) {
     if (page.kind == UNWIND_SECOND_LEVEL_COMPRESSED) {
       uintptr_t functionAddressBase =
-          cuEntries[cuIndices[page.entryIndex]].functionAddress;
+          cuEntries[page.entryIndex].functionAddress;
       auto *p2p =
           reinterpret_cast<unwind_info_compressed_second_level_page_header *>(
               pp);
@@ -708,8 +699,7 @@ void UnwindInfoSectionImpl::writeTo(uint8_t *buf) const {
       p2p->encodingsCount = page.localEncodings.size();
       auto *ep = reinterpret_cast<uint32_t *>(&p2p[1]);
       for (size_t i = 0; i < page.entryCount; i++) {
-        const CompactUnwindEntry &cue =
-            cuEntries[cuIndices[page.entryIndex + i]];
+        const CompactUnwindEntry &cue = cuEntries[page.entryIndex + i];
         auto it = commonEncodingIndexes.find(cue.encoding);
         if (it == commonEncodingIndexes.end())
           it = page.localEncodingIndexes.find(cue.encoding);
@@ -728,8 +718,7 @@ void UnwindInfoSectionImpl::writeTo(uint8_t *buf) const {
       p2p->entryCount = page.entryCount;
       auto *ep = reinterpret_cast<uint32_t *>(&p2p[1]);
       for (size_t i = 0; i < page.entryCount; i++) {
-        const CompactUnwindEntry &cue =
-            cuEntries[cuIndices[page.entryIndex + i]];
+        const CompactUnwindEntry &cue = cuEntries[page.entryIndex + i];
         *ep++ = cue.functionAddress;
         *ep++ = cue.encoding;
       }