47 files changed, 2055 insertions, 242 deletions
diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp
index 90bae77..6fb2807 100644
--- a/llvm/lib/Analysis/LazyValueInfo.cpp
+++ b/llvm/lib/Analysis/LazyValueInfo.cpp
@@ -59,6 +59,11 @@ INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(LazyValueInfoWrapperPass, "lazy-value-info",
                 "Lazy Value Information Analysis", false, true)
 
+static cl::opt<bool> PerPredRanges(
+    "lvi-per-pred-ranges", cl::Hidden, cl::init(false),
+    cl::desc("Enable tracking of ranges for a value in a block for"
+             "each block predecessor (default = false)"));
+
 namespace llvm {
 FunctionPass *createLazyValueInfoPass() {
   return new LazyValueInfoWrapperPass();
@@ -103,6 +108,10 @@ namespace {
 
 namespace {
 using NonNullPointerSet = SmallDenseSet<AssertingVH<Value>, 2>;
+using BBLatticeElementMap =
+    SmallDenseMap<PoisoningVH<BasicBlock>, ValueLatticeElement, 4>;
+using PredecessorValueLatticeMap =
+    SmallDenseMap<AssertingVH<Value>, BBLatticeElementMap, 2>;
 
 /// This is the cache kept by LazyValueInfo which
 /// maintains information about queries across the clients' queries.
@@ -117,6 +126,10 @@ class LazyValueInfoCache {
     // std::nullopt indicates that the nonnull pointers for this basic block
     // block have not been computed yet.
     std::optional<NonNullPointerSet> NonNullPointers;
+    // This is an extension of the above LatticeElements, caching, for each
+    // Value, a ValueLatticeElement, for each predecessor of the BB tracked by
+    // this entry.
+    std::optional<PredecessorValueLatticeMap> PredecessorLatticeElements;
   };
 
   /// Cached information per basic block.
@@ -134,8 +147,14 @@ class LazyValueInfoCache {
 
   BlockCacheEntry *getOrCreateBlockEntry(BasicBlock *BB) {
     auto It = BlockCache.find_as(BB);
-    if (It == BlockCache.end())
-      It = BlockCache.insert({BB, std::make_unique<BlockCacheEntry>()}).first;
+    if (It == BlockCache.end()) {
+      std::unique_ptr<BlockCacheEntry> BCE =
+          std::make_unique<BlockCacheEntry>();
+      if (PerPredRanges)
+        BCE->PredecessorLatticeElements =
+            std::make_optional<PredecessorValueLatticeMap>();
+      It = BlockCache.insert({BB, std::move(BCE)}).first;
+    }
 
     return It->second.get();
   }
@@ -161,6 +180,28 @@ public:
     addValueHandle(Val);
   }
 
+  void insertPredecessorResults(Value *Val, BasicBlock *BB,
+                                BBLatticeElementMap &PredLatticeElements) {
+    BlockCacheEntry *Entry = getOrCreateBlockEntry(BB);
+
+    Entry->PredecessorLatticeElements->insert({Val, PredLatticeElements});
+
+    addValueHandle(Val);
+  }
+
+  std::optional<BBLatticeElementMap>
+  getCachedPredecessorInfo(Value *V, BasicBlock *BB) const {
+    const BlockCacheEntry *Entry = getBlockEntry(BB);
+    if (!Entry)
+      return std::nullopt;
+
+    auto LatticeIt = Entry->PredecessorLatticeElements->find_as(V);
+    if (LatticeIt == Entry->PredecessorLatticeElements->end())
+      return std::nullopt;
+
+    return LatticeIt->second;
+  }
+
   std::optional<ValueLatticeElement> getCachedValueInfo(Value *V,
                                                         BasicBlock *BB) const {
     const BlockCacheEntry *Entry = getBlockEntry(BB);
@@ -216,6 +257,8 @@ void LazyValueInfoCache::eraseValue(Value *V) {
     Pair.second->OverDefined.erase(V);
     if (Pair.second->NonNullPointers)
       Pair.second->NonNullPointers->erase(V);
+    if (PerPredRanges)
+      Pair.second->PredecessorLatticeElements->erase(V);
   }
 
   auto HandleIt = ValueHandles.find_as(V);
@@ -230,6 +273,10 @@ void LVIValueHandle::deleted() {
 }
 
 void LazyValueInfoCache::eraseBlock(BasicBlock *BB) {
+  // Clear all when a BB is removed.
+  if (PerPredRanges)
+    for (auto &Pair : BlockCache)
+      Pair.second->PredecessorLatticeElements->clear();
   BlockCache.erase(BB);
 }
 
@@ -691,6 +738,9 @@ LazyValueInfoImpl::solveBlockValueNonLocal(Value *Val, BasicBlock *BB) {
   // find a path to function entry.  TODO: We should consider explicitly
   // canonicalizing to make this true rather than relying on this happy
   // accident.
+  std::optional<BBLatticeElementMap> PredLatticeElements;
+  if (PerPredRanges)
+    PredLatticeElements = std::make_optional<BBLatticeElementMap>();
   for (BasicBlock *Pred : predecessors(BB)) {
     // Skip self loops.
     if (Pred == BB)
@@ -710,8 +760,13 @@ LazyValueInfoImpl::solveBlockValueNonLocal(Value *Val, BasicBlock *BB) {
                         << Pred->getName() << "' (non local).\n");
       return Result;
     }
+    if (PerPredRanges)
+      PredLatticeElements->insert({Pred, *EdgeResult});
   }
 
+  if (PerPredRanges)
+    TheCache.insertPredecessorResults(Val, BB, *PredLatticeElements);
+
   // Return the merged value, which is more precise than 'overdefined'.
   assert(!Result.isOverdefined());
   return Result;
@@ -724,6 +779,9 @@ LazyValueInfoImpl::solveBlockValuePHINode(PHINode *PN, BasicBlock *BB) {
   // Loop over all of our predecessors, merging what we know from them into
   // result.  See the comment about the chosen traversal order in
   // solveBlockValueNonLocal; the same reasoning applies here.
+  std::optional<BBLatticeElementMap> PredLatticeElements;
+  if (PerPredRanges)
+    PredLatticeElements = std::make_optional<BBLatticeElementMap>();
   for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
     BasicBlock *PhiBB = PN->getIncomingBlock(i);
     Value *PhiVal = PN->getIncomingValue(i);
@@ -746,8 +804,14 @@ LazyValueInfoImpl::solveBlockValuePHINode(PHINode *PN, BasicBlock *BB) {
 
       return Result;
     }
+
+    if (PerPredRanges)
+      PredLatticeElements->insert({PhiBB, *EdgeResult});
   }
 
+  if (PerPredRanges)
+    TheCache.insertPredecessorResults(PN, BB, *PredLatticeElements);
+
   // Return the merged value, which is more precise than 'overdefined'.
   assert(!Result.isOverdefined() && "Possible PHI in entry block?");
   return Result;
@@ -1002,7 +1066,77 @@ LazyValueInfoImpl::solveBlockValueBinaryOpImpl(
 
   const ConstantRange &LHSRange = *LHSRes;
   const ConstantRange &RHSRange = *RHSRes;
-  return ValueLatticeElement::getRange(OpFn(LHSRange, RHSRange));
+
+  std::optional<ValueLatticeElement> MergedResult =
+      ValueLatticeElement::getRange(OpFn(LHSRange, RHSRange));
+
+  if (!PerPredRanges)
+    return MergedResult;
+
+  std::optional<BBLatticeElementMap> PredLHS =
+      TheCache.getCachedPredecessorInfo(LHS, BB);
+  if (!PredLHS)
+    return MergedResult;
+  std::optional<BBLatticeElementMap> PredRHS =
+      TheCache.getCachedPredecessorInfo(RHS, BB);
+  if (!PredRHS)
+    return MergedResult;
+
+  const BBLatticeElementMap &LHSPredMap = *PredLHS;
+  const BBLatticeElementMap &RHSPredMap = *PredRHS;
+
+  BBLatticeElementMap PredLatticeElements;
+  ValueLatticeElement OverallPredResult;
+  for (auto *Pred : predecessors(BB)) {
+    auto LHSIt = LHSPredMap.find_as(Pred);
+    if (LHSIt == LHSPredMap.end())
+      return MergedResult;
+    const ValueLatticeElement &LHSFromPred = LHSIt->second;
+    std::optional<ConstantRange> LHSFromPredRes =
+        LHSFromPred.asConstantRange(LHS->getType());
+    if (!LHSFromPredRes)
+      return MergedResult;
+
+    auto RHSIt = RHSPredMap.find_as(Pred);
+    if (RHSIt == RHSPredMap.end())
+      return MergedResult;
+    const ValueLatticeElement &RHSFromPred = RHSIt->second;
+    std::optional<ConstantRange> RHSFromPredRes =
+        RHSFromPred.asConstantRange(RHS->getType());
+    if (!RHSFromPredRes)
+      return MergedResult;
+
+    const ConstantRange &LHSFromPredRange = *LHSFromPredRes;
+    const ConstantRange &RHSFromPredRange = *RHSFromPredRes;
+    std::optional<ValueLatticeElement> PredResult =
+        ValueLatticeElement::getRange(OpFn(LHSFromPredRange, RHSFromPredRange));
+    if (!PredResult)
+      return MergedResult;
+    if (PredResult->isOverdefined()) {
+      LLVM_DEBUG(
+          dbgs() << " pred BB '" << Pred->getName() << "' for BB '"
+                 << BB->getName()
+                 << "' overdefined. Discarding all predecessor intervals.\n");
+      return MergedResult;
+    }
+    PredLatticeElements.insert({Pred, *PredResult});
+    OverallPredResult.mergeIn(*PredResult);
+  }
+
+  // If this point is reached, all predecessors for both LHS and RHS have
+  // constant ranges previously computed. Can cache result and use the
+  // OverallPredResult;
+  TheCache.insertPredecessorResults(I, BB, PredLatticeElements);
+
+  LLVM_DEBUG(dbgs() << " Using predecessor intervals, evaluated " << *I
+                    << " to: " << OverallPredResult << ".\n");
+
+  if (!MergedResult)
+    return OverallPredResult;
+
+  LLVM_DEBUG(dbgs() << " Intersecting intervals for " << *I << ": "
+                    << OverallPredResult << " and  " << MergedResult << ".\n");
+  return MergedResult->intersect(OverallPredResult);
 }
 
 std::optional<ValueLatticeElement>
diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt
index 6ed724b..7ae5f7e 100644
--- a/llvm/lib/CAS/CMakeLists.txt
+++ b/llvm/lib/CAS/CMakeLists.txt
@@ -2,14 +2,19 @@ add_llvm_component_library(LLVMCAS
   ActionCache.cpp
   ActionCaches.cpp
   BuiltinCAS.cpp
+  DatabaseFile.cpp
   InMemoryCAS.cpp
   MappedFileRegionArena.cpp
   ObjectStore.cpp
   OnDiskCommon.cpp
+  OnDiskTrieRawHashMap.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${LLVM_MAIN_INCLUDE_DIR}/llvm/CAS
 
+  LINK_LIBS
+  ${LLVM_PTHREAD_LIB}
+
   LINK_COMPONENTS
   Support
 )
diff --git a/llvm/lib/CAS/DatabaseFile.cpp b/llvm/lib/CAS/DatabaseFile.cpp
new file mode 100644
index 0000000..db8ce1d
--- /dev/null
+++ b/llvm/lib/CAS/DatabaseFile.cpp
@@ -0,0 +1,123 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file This file implements the common abstractions for CAS database file.
+///
+//===----------------------------------------------------------------------===//
+
+#include "DatabaseFile.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+
+Error ondisk::createTableConfigError(std::errc ErrC, StringRef Path,
+                                     StringRef TableName, const Twine &Msg) {
+  return createStringError(make_error_code(ErrC),
+                           Path + "[" + TableName + "]: " + Msg);
+}
+
+Error ondisk::checkTable(StringRef Label, size_t Expected, size_t Observed,
+                         StringRef Path, StringRef TrieName) {
+  if (Expected == Observed)
+    return Error::success();
+  return createTableConfigError(std::errc::invalid_argument, Path, TrieName,
+                                "mismatched " + Label +
+                                    " (expected: " + Twine(Expected) +
+                                    ", observed: " + Twine(Observed) + ")");
+}
+
+Expected<DatabaseFile>
+DatabaseFile::create(const Twine &Path, uint64_t Capacity,
+                     function_ref<Error(DatabaseFile &)> NewDBConstructor) {
+  // Constructor for if the file doesn't exist.
+  auto NewFileConstructor = [&](MappedFileRegionArena &Alloc) -> Error {
+    if (Alloc.capacity() <
+        sizeof(Header) + sizeof(MappedFileRegionArena::Header))
+      return createTableConfigError(std::errc::argument_out_of_domain,
+                                    Path.str(), "datafile",
+                                    "Allocator too small for header");
+    (void)new (Alloc.data()) Header{getMagic(), getVersion(), {0}};
+    DatabaseFile DB(Alloc);
+    return NewDBConstructor(DB);
+  };
+
+  // Get or create the file.
+  MappedFileRegionArena Alloc;
+  if (Error E = MappedFileRegionArena::create(Path, Capacity, sizeof(Header),
+                                              NewFileConstructor)
+                    .moveInto(Alloc))
+    return std::move(E);
+
+  return DatabaseFile::get(
+      std::make_unique<MappedFileRegionArena>(std::move(Alloc)));
+}
+
+Error DatabaseFile::addTable(TableHandle Table) {
+  assert(Table);
+  assert(&Table.getRegion() == &getRegion());
+  int64_t ExistingRootOffset = 0;
+  const int64_t NewOffset =
+      reinterpret_cast<const char *>(&Table.getHeader()) - getRegion().data();
+  if (H->RootTableOffset.compare_exchange_strong(ExistingRootOffset, NewOffset))
+    return Error::success();
+
+  // Silently ignore attempts to set the root to itself.
+  if (ExistingRootOffset == NewOffset)
+    return Error::success();
+
+  // Return an proper error message.
+  TableHandle Root(getRegion(), ExistingRootOffset);
+  if (Root.getName() == Table.getName())
+    return createStringError(
+        make_error_code(std::errc::not_supported),
+        "collision with existing table of the same name '" + Table.getName() +
+            "'");
+
+  return createStringError(make_error_code(std::errc::not_supported),
+                           "cannot add new table '" + Table.getName() +
+                               "'"
+                               " to existing root '" +
+                               Root.getName() + "'");
+}
+
+std::optional<TableHandle> DatabaseFile::findTable(StringRef Name) {
+  int64_t RootTableOffset = H->RootTableOffset.load();
+  if (!RootTableOffset)
+    return std::nullopt;
+
+  TableHandle Root(getRegion(), RootTableOffset);
+  if (Root.getName() == Name)
+    return Root;
+
+  return std::nullopt;
+}
+
+Error DatabaseFile::validate(MappedFileRegion &Region) {
+  if (Region.size() < sizeof(Header))
+    return createStringError(std::errc::invalid_argument,
+                             "database: missing header");
+
+  // Check the magic and version.
+  auto *H = reinterpret_cast<Header *>(Region.data());
+  if (H->Magic != getMagic())
+    return createStringError(std::errc::invalid_argument,
+                             "database: bad magic");
+  if (H->Version != getVersion())
+    return createStringError(std::errc::invalid_argument,
+                             "database: wrong version");
+
+  auto *MFH = reinterpret_cast<MappedFileRegionArena::Header *>(Region.data() +
+                                                                sizeof(Header));
+  // Check the bump-ptr, which should point past the header.
+  if (MFH->BumpPtr.load() < (int64_t)sizeof(Header))
+    return createStringError(std::errc::invalid_argument,
+                             "database: corrupt bump-ptr");
+
+  return Error::success();
+}
diff --git a/llvm/lib/CAS/DatabaseFile.h b/llvm/lib/CAS/DatabaseFile.h
new file mode 100644
index 0000000..609e5f13
--- /dev/null
+++ b/llvm/lib/CAS/DatabaseFile.h
@@ -0,0 +1,153 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file declares the common interface for a DatabaseFile that is used to
+/// implement OnDiskCAS.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CAS_DATABASEFILE_H
+#define LLVM_LIB_CAS_DATABASEFILE_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CAS/MappedFileRegionArena.h"
+#include "llvm/Support/Error.h"
+
+namespace llvm::cas::ondisk {
+
+using MappedFileRegion = MappedFileRegionArena::RegionT;
+
+/// Generic handle for a table.
+///
+/// Generic table header layout:
+/// - 2-bytes: TableKind
+/// - 2-bytes: TableNameSize
+/// - 4-bytes: TableNameRelOffset (relative to header)
+class TableHandle {
+public:
+  enum class TableKind : uint16_t {
+    TrieRawHashMap = 1,
+    DataAllocator = 2,
+  };
+  struct Header {
+    TableKind Kind;
+    uint16_t NameSize;
+    int32_t NameRelOffset; ///< Relative to Header.
+  };
+
+  explicit operator bool() const { return H; }
+  const Header &getHeader() const { return *H; }
+  MappedFileRegion &getRegion() const { return *Region; }
+
+  template <class T> static void check() {
+    static_assert(
+        std::is_same<decltype(T::Header::GenericHeader), Header>::value,
+        "T::GenericHeader should be of type TableHandle::Header");
+    static_assert(offsetof(typename T::Header, GenericHeader) == 0,
+                  "T::GenericHeader must be the head of T::Header");
+  }
+  template <class T> bool is() const { return T::Kind == H->Kind; }
+  template <class T> T dyn_cast() const {
+    check<T>();
+    if (is<T>())
+      return T(*Region, *reinterpret_cast<typename T::Header *>(H));
+    return T();
+  }
+  template <class T> T cast() const {
+    assert(is<T>());
+    return dyn_cast<T>();
+  }
+
+  StringRef getName() const {
+    auto *Begin = reinterpret_cast<const char *>(H) + H->NameRelOffset;
+    return StringRef(Begin, H->NameSize);
+  }
+
+  TableHandle() = default;
+  TableHandle(MappedFileRegion &Region, Header &H) : Region(&Region), H(&H) {}
+  TableHandle(MappedFileRegion &Region, intptr_t HeaderOffset)
+      : TableHandle(Region,
+                    *reinterpret_cast<Header *>(Region.data() + HeaderOffset)) {
+  }
+
+private:
+  MappedFileRegion *Region = nullptr;
+  Header *H = nullptr;
+};
+
+/// Encapsulate a database file, which:
+/// - Sets/checks magic.
+/// - Sets/checks version.
+/// - Points at an arbitrary root table.
+/// - Sets up a MappedFileRegionArena for allocation.
+///
+/// Top-level layout:
+/// - 4-bytes: Magic
+/// - 4-bytes: Version
+/// - 8-bytes: RootTableOffset (16-bits: Kind; 48-bits: Offset)
+/// - 8-bytes: BumpPtr from MappedFileRegionArena
+class DatabaseFile {
+public:
+  static constexpr uint32_t getMagic() { return 0xDA7ABA53UL; }
+  static constexpr uint32_t getVersion() { return 1UL; }
+  struct Header {
+    uint32_t Magic;
+    uint32_t Version;
+    std::atomic<int64_t> RootTableOffset;
+  };
+
+  const Header &getHeader() { return *H; }
+  MappedFileRegionArena &getAlloc() { return Alloc; }
+  MappedFileRegion &getRegion() { return Alloc.getRegion(); }
+
+  /// Add a table. This is currently not thread safe and should be called inside
+  /// NewDBConstructor.
+  Error addTable(TableHandle Table);
+
+  /// Find a table. May return null.
+  std::optional<TableHandle> findTable(StringRef Name);
+
+  /// Create the DatabaseFile at Path with Capacity.
+  static Expected<DatabaseFile>
+  create(const Twine &Path, uint64_t Capacity,
+         function_ref<Error(DatabaseFile &)> NewDBConstructor);
+
+  size_t size() const { return Alloc.size(); }
+
+private:
+  static Expected<DatabaseFile>
+  get(std::unique_ptr<MappedFileRegionArena> Alloc) {
+    if (Error E = validate(Alloc->getRegion()))
+      return std::move(E);
+    return DatabaseFile(std::move(Alloc));
+  }
+
+  static Error validate(MappedFileRegion &Region);
+
+  DatabaseFile(MappedFileRegionArena &Alloc)
+      : H(reinterpret_cast<Header *>(Alloc.data())), Alloc(Alloc) {}
+  DatabaseFile(std::unique_ptr<MappedFileRegionArena> Alloc)
+      : DatabaseFile(*Alloc) {
+    OwnedAlloc = std::move(Alloc);
+  }
+
+  Header *H = nullptr;
+  MappedFileRegionArena &Alloc;
+  std::unique_ptr<MappedFileRegionArena> OwnedAlloc;
+};
+
+Error createTableConfigError(std::errc ErrC, StringRef Path,
+                             StringRef TableName, const Twine &Msg);
+
+Error checkTable(StringRef Label, size_t Expected, size_t Observed,
+                 StringRef Path, StringRef TrieName);
+
+} // namespace llvm::cas::ondisk
+
+#endif
diff --git a/llvm/lib/CAS/InMemoryCAS.cpp b/llvm/lib/CAS/InMemoryCAS.cpp
index 255b89c..c63ee70d 100644
--- a/llvm/lib/CAS/InMemoryCAS.cpp
+++ b/llvm/lib/CAS/InMemoryCAS.cpp
@@ -57,6 +57,9 @@ public:
   InMemoryObject() = delete;
   InMemoryObject(InMemoryObject &&) = delete;
   InMemoryObject(const InMemoryObject &) = delete;
+  InMemoryObject &operator=(const InMemoryObject &) = delete;
+  InMemoryObject &operator=(InMemoryObject &&) = delete;
+  virtual ~InMemoryObject() = default;
 
 protected:
   InMemoryObject(Kind K, const InMemoryIndexValueT &I) : IndexAndKind(&I, K) {}
diff --git a/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp b/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp
new file mode 100644
index 0000000..9b382dd7
--- /dev/null
+++ b/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp
@@ -0,0 +1,1178 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file Implements OnDiskTrieRawHashMap.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/OnDiskTrieRawHashMap.h"
+#include "DatabaseFile.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/TrieHashIndexGenerator.h"
+#include "llvm/CAS/MappedFileRegionArena.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/ThreadPool.h"
+#include "llvm/Support/Threading.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+
+#if LLVM_ENABLE_ONDISK_CAS
+
+//===----------------------------------------------------------------------===//
+// TrieRawHashMap data structures.
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+class SubtrieHandle;
+class TrieRawHashMapHandle;
+class TrieVisitor;
+
+/// A value stored in the slots inside a SubTrie. A stored value can either be a
+/// subtrie (encoded after negation) which is the file offset to another
+/// subtrie, or it can be a fileset to a DataRecord.
+class SubtrieSlotValue {
+public:
+  explicit operator bool() const { return !isEmpty(); }
+  bool isEmpty() const { return !Offset; }
+  bool isData() const { return Offset > 0; }
+  bool isSubtrie() const { return Offset < 0; }
+  uint64_t asData() const {
+    assert(isData());
+    return Offset;
+  }
+  uint64_t asSubtrie() const {
+    assert(isSubtrie());
+    return -Offset;
+  }
+
+  FileOffset asSubtrieFileOffset() const { return FileOffset(asSubtrie()); }
+
+  FileOffset asDataFileOffset() const { return FileOffset(asData()); }
+
+  int64_t getRawOffset() const { return Offset; }
+
+  static SubtrieSlotValue getDataOffset(int64_t Offset) {
+    return SubtrieSlotValue(Offset);
+  }
+
+  static SubtrieSlotValue getSubtrieOffset(int64_t Offset) {
+    return SubtrieSlotValue(-Offset);
+  }
+
+  static SubtrieSlotValue getDataOffset(FileOffset Offset) {
+    return getDataOffset(Offset.get());
+  }
+
+  static SubtrieSlotValue getSubtrieOffset(FileOffset Offset) {
+    return getDataOffset(Offset.get());
+  }
+
+  static SubtrieSlotValue getFromSlot(std::atomic<int64_t> &Slot) {
+    return SubtrieSlotValue(Slot.load());
+  }
+
+  SubtrieSlotValue() = default;
+
+private:
+  friend class SubtrieHandle;
+  explicit SubtrieSlotValue(int64_t Offset) : Offset(Offset) {}
+  int64_t Offset = 0;
+};
+
+/// Subtrie layout:
+/// - 2-bytes: StartBit
+/// - 1-bytes: NumBits=lg(num-slots)
+/// - 5-bytes: 0-pad
+/// - <slots>
+class SubtrieHandle {
+public:
+  struct Header {
+    /// The bit this subtrie starts on.
+    uint16_t StartBit;
+
+    /// The number of bits this subtrie handles. It has 2^NumBits slots.
+    uint8_t NumBits;
+
+    /// 0-pad to 8B.
+    uint8_t ZeroPad1B;
+    uint32_t ZeroPad4B;
+  };
+
+  /// Slot storage:
+  /// - zero:     Empty
+  /// - positive: RecordOffset
+  /// - negative: SubtrieOffset
+  using SlotT = std::atomic<int64_t>;
+
+  static int64_t getSlotsSize(uint32_t NumBits) {
+    return sizeof(int64_t) * (1u << NumBits);
+  }
+
+  static int64_t getSize(uint32_t NumBits) {
+    return sizeof(SubtrieHandle::Header) + getSlotsSize(NumBits);
+  }
+
+  int64_t getSize() const { return getSize(H->NumBits); }
+  size_t getNumSlots() const { return Slots.size(); }
+
+  SubtrieSlotValue load(size_t I) const {
+    return SubtrieSlotValue(Slots[I].load());
+  }
+  void store(size_t I, SubtrieSlotValue V) {
+    return Slots[I].store(V.getRawOffset());
+  }
+
+  void printHash(raw_ostream &OS, ArrayRef<uint8_t> Bytes) const;
+
+  /// Return None on success, or the existing offset on failure.
+  bool compare_exchange_strong(size_t I, SubtrieSlotValue &Expected,
+                               SubtrieSlotValue New) {
+    return Slots[I].compare_exchange_strong(Expected.Offset, New.Offset);
+  }
+
+  /// Sink \p V from \p I in this subtrie down to \p NewI in a new subtrie with
+  /// \p NumSubtrieBits.
+  ///
+  /// \p UnusedSubtrie maintains a 1-item "free" list of unused subtries. If a
+  /// new subtrie is created that isn't used because of a lost race, then it If
+  /// it's already valid, it should be used instead of allocating a new one.
+  /// should be returned as an out parameter to be passed back in the future.
+  /// If it's already valid, it should be used instead of allocating a new one.
+  ///
+  /// Returns the subtrie that now lives at \p I.
+  Expected<SubtrieHandle> sink(size_t I, SubtrieSlotValue V,
+                               MappedFileRegionArena &Alloc,
+                               size_t NumSubtrieBits,
+                               SubtrieHandle &UnusedSubtrie, size_t NewI);
+
+  /// Only safe if the subtrie is empty.
+  void reinitialize(uint32_t StartBit, uint32_t NumBits);
+
+  SubtrieSlotValue getOffset() const {
+    return SubtrieSlotValue::getSubtrieOffset(
+        reinterpret_cast<const char *>(H) - Region->data());
+  }
+
+  FileOffset getFileOffset() const { return getOffset().asSubtrieFileOffset(); }
+
+  explicit operator bool() const { return H; }
+
+  Header &getHeader() const { return *H; }
+  uint32_t getStartBit() const { return H->StartBit; }
+  uint32_t getNumBits() const { return H->NumBits; }
+
+  static Expected<SubtrieHandle> create(MappedFileRegionArena &Alloc,
+                                        uint32_t StartBit, uint32_t NumBits);
+
+  static SubtrieHandle getFromFileOffset(MappedFileRegion &Region,
+                                         FileOffset Offset) {
+    return SubtrieHandle(Region, SubtrieSlotValue::getSubtrieOffset(Offset));
+  }
+
+  SubtrieHandle() = default;
+  SubtrieHandle(MappedFileRegion &Region, Header &H)
+      : Region(&Region), H(&H), Slots(getSlots(H)) {}
+  SubtrieHandle(MappedFileRegion &Region, SubtrieSlotValue Offset)
+      : SubtrieHandle(Region, *reinterpret_cast<Header *>(
+                                  Region.data() + Offset.asSubtrie())) {}
+
+private:
+  MappedFileRegion *Region = nullptr;
+  Header *H = nullptr;
+  MutableArrayRef<SlotT> Slots;
+
+  static MutableArrayRef<SlotT> getSlots(Header &H) {
+    return MutableArrayRef(reinterpret_cast<SlotT *>(&H + 1), 1u << H.NumBits);
+  }
+};
+
+/// Handle for a TrieRawHashMap table.
+///
+/// TrieRawHashMap table layout:
+/// - [8-bytes: Generic table header]
+/// - 1-byte:  NumSubtrieBits
+/// - 1-byte:  Flags (not used yet)
+/// - 2-bytes: NumHashBits
+/// - 4-bytes: RecordDataSize (in bytes)
+/// - 8-bytes: RootTrieOffset
+/// - 8-bytes: AllocatorOffset (reserved for implementing free lists)
+/// - <name> '\0'
+///
+/// Record layout:
+/// - <hash>
+/// - <data>
+class TrieRawHashMapHandle {
+public:
+  static constexpr TableHandle::TableKind Kind =
+      TableHandle::TableKind::TrieRawHashMap;
+
+  struct Header {
+    TableHandle::Header GenericHeader;
+    uint8_t NumSubtrieBits;
+    uint8_t Flags; ///< None used yet.
+    uint16_t NumHashBits;
+    uint32_t RecordDataSize;
+    std::atomic<int64_t> RootTrieOffset;
+    std::atomic<int64_t> AllocatorOffset;
+  };
+
+  operator TableHandle() const {
+    if (!H)
+      return TableHandle();
+    return TableHandle(*Region, H->GenericHeader);
+  }
+
+  struct RecordData {
+    OnDiskTrieRawHashMap::ValueProxy Proxy;
+    SubtrieSlotValue Offset;
+    FileOffset getFileOffset() const { return Offset.asDataFileOffset(); }
+  };
+
+  enum Limits : size_t {
+    /// Seems like 65528 hash bits ought to be enough.
+    MaxNumHashBytes = UINT16_MAX >> 3,
+    MaxNumHashBits = MaxNumHashBytes << 3,
+
+    /// 2^16 bits in a trie is 65536 slots. This restricts us to a 16-bit
+    /// index. This many slots is suspicously large anyway.
+    MaxNumRootBits = 16,
+
+    /// 2^10 bits in a trie is 1024 slots. This many slots seems suspiciously
+    /// large for subtries.
+    MaxNumSubtrieBits = 10,
+  };
+
+  static constexpr size_t getNumHashBytes(size_t NumHashBits) {
+    assert(NumHashBits % 8 == 0);
+    return NumHashBits / 8;
+  }
+  static constexpr size_t getRecordSize(size_t RecordDataSize,
+                                        size_t NumHashBits) {
+    return RecordDataSize + getNumHashBytes(NumHashBits);
+  }
+
+  RecordData getRecord(SubtrieSlotValue Offset);
+  Expected<RecordData> createRecord(MappedFileRegionArena &Alloc,
+                                    ArrayRef<uint8_t> Hash);
+
+  explicit operator bool() const { return H; }
+  const Header &getHeader() const { return *H; }
+  SubtrieHandle getRoot() const;
+  Expected<SubtrieHandle> getOrCreateRoot(MappedFileRegionArena &Alloc);
+  MappedFileRegion &getRegion() const { return *Region; }
+
+  size_t getFlags() const { return H->Flags; }
+  size_t getNumSubtrieBits() const { return H->NumSubtrieBits; }
+  size_t getNumHashBits() const { return H->NumHashBits; }
+  size_t getNumHashBytes() const { return getNumHashBytes(H->NumHashBits); }
+  size_t getRecordDataSize() const { return H->RecordDataSize; }
+  size_t getRecordSize() const {
+    return getRecordSize(H->RecordDataSize, H->NumHashBits);
+  }
+
+  TrieHashIndexGenerator getIndexGen(SubtrieHandle Root,
+                                     ArrayRef<uint8_t> Hash) {
+    assert(Root.getStartBit() == 0);
+    assert(getNumHashBytes() == Hash.size());
+    assert(getNumHashBits() == Hash.size() * 8);
+    return TrieHashIndexGenerator{Root.getNumBits(), getNumSubtrieBits(), Hash};
+  }
+
+  static Expected<TrieRawHashMapHandle>
+  create(MappedFileRegionArena &Alloc, StringRef Name,
+         std::optional<uint64_t> NumRootBits, uint64_t NumSubtrieBits,
+         uint64_t NumHashBits, uint64_t RecordDataSize);
+
+  void
+  print(raw_ostream &OS,
+        function_ref<void(ArrayRef<char>)> PrintRecordData = nullptr) const;
+
+  Error validate(
+      function_ref<Error(FileOffset, OnDiskTrieRawHashMap::ConstValueProxy)>
+          RecordVerifier) const;
+  TrieRawHashMapHandle() = default;
+  TrieRawHashMapHandle(MappedFileRegion &Region, Header &H)
+      : Region(&Region), H(&H) {}
+  TrieRawHashMapHandle(MappedFileRegion &Region, intptr_t HeaderOffset)
+      : TrieRawHashMapHandle(
+            Region, *reinterpret_cast<Header *>(Region.data() + HeaderOffset)) {
+  }
+
+private:
+  MappedFileRegion *Region = nullptr;
+  Header *H = nullptr;
+};
+
+} // end anonymous namespace
+
+struct OnDiskTrieRawHashMap::ImplType {
+  DatabaseFile File;
+  TrieRawHashMapHandle Trie;
+};
+
+Expected<SubtrieHandle> SubtrieHandle::create(MappedFileRegionArena &Alloc,
+                                              uint32_t StartBit,
+                                              uint32_t NumBits) {
+  assert(StartBit <= TrieRawHashMapHandle::MaxNumHashBits);
+  assert(NumBits <= UINT8_MAX);
+  assert(NumBits <= TrieRawHashMapHandle::MaxNumRootBits);
+
+  auto Mem = Alloc.allocate(getSize(NumBits));
+  if (LLVM_UNLIKELY(!Mem))
+    return Mem.takeError();
+  auto *H =
+      new (*Mem) SubtrieHandle::Header{(uint16_t)StartBit, (uint8_t)NumBits,
+                                       /*ZeroPad1B=*/0, /*ZeroPad4B=*/0};
+  SubtrieHandle S(Alloc.getRegion(), *H);
+  for (auto I = S.Slots.begin(), E = S.Slots.end(); I != E; ++I)
+    new (I) SlotT(0);
+  return S;
+}
+
+SubtrieHandle TrieRawHashMapHandle::getRoot() const {
+  if (int64_t Root = H->RootTrieOffset)
+    return SubtrieHandle(getRegion(), SubtrieSlotValue::getSubtrieOffset(Root));
+  return SubtrieHandle();
+}
+
+Expected<SubtrieHandle>
+TrieRawHashMapHandle::getOrCreateRoot(MappedFileRegionArena &Alloc) {
+  assert(&Alloc.getRegion() == &getRegion());
+  if (SubtrieHandle Root = getRoot())
+    return Root;
+
+  int64_t Race = 0;
+  auto LazyRoot = SubtrieHandle::create(Alloc, 0, H->NumSubtrieBits);
+  if (LLVM_UNLIKELY(!LazyRoot))
+    return LazyRoot.takeError();
+  if (H->RootTrieOffset.compare_exchange_strong(
+          Race, LazyRoot->getOffset().asSubtrie()))
+    return *LazyRoot;
+
+  // There was a race. Return the other root.
+  //
+  // TODO: Avoid leaking the lazy root by storing it in an allocator.
+  return SubtrieHandle(getRegion(), SubtrieSlotValue::getSubtrieOffset(Race));
+}
+
+Expected<TrieRawHashMapHandle>
+TrieRawHashMapHandle::create(MappedFileRegionArena &Alloc, StringRef Name,
+                             std::optional<uint64_t> NumRootBits,
+                             uint64_t NumSubtrieBits, uint64_t NumHashBits,
+                             uint64_t RecordDataSize) {
+  // Allocate.
+  auto Offset = Alloc.allocateOffset(sizeof(Header) + Name.size() + 1);
+  if (LLVM_UNLIKELY(!Offset))
+    return Offset.takeError();
+
+  // Construct the header and the name.
+  assert(Name.size() <= UINT16_MAX && "Expected smaller table name");
+  assert(NumSubtrieBits <= UINT8_MAX && "Expected valid subtrie bits");
+  assert(NumHashBits <= UINT16_MAX && "Expected valid hash size");
+  assert(RecordDataSize <= UINT32_MAX && "Expected smaller table name");
+  auto *H = new (Alloc.getRegion().data() + *Offset)
+      Header{{TableHandle::TableKind::TrieRawHashMap, (uint16_t)Name.size(),
+              (uint32_t)sizeof(Header)},
+             (uint8_t)NumSubtrieBits,
+             /*Flags=*/0,
+             (uint16_t)NumHashBits,
+             (uint32_t)RecordDataSize,
+             /*RootTrieOffset=*/{0},
+             /*AllocatorOffset=*/{0}};
+  char *NameStorage = reinterpret_cast<char *>(H + 1);
+  llvm::copy(Name, NameStorage);
+  NameStorage[Name.size()] = 0;
+
+  // Construct a root trie, if requested.
+  TrieRawHashMapHandle Trie(Alloc.getRegion(), *H);
+  auto Sub = SubtrieHandle::create(Alloc, 0, *NumRootBits);
+  if (LLVM_UNLIKELY(!Sub))
+    return Sub.takeError();
+  if (NumRootBits)
+    H->RootTrieOffset = Sub->getOffset().asSubtrie();
+  return Trie;
+}
+
+TrieRawHashMapHandle::RecordData
+TrieRawHashMapHandle::getRecord(SubtrieSlotValue Offset) {
+  char *Begin = Region->data() + Offset.asData();
+  OnDiskTrieRawHashMap::ValueProxy Proxy;
+  Proxy.Data = MutableArrayRef(Begin, getRecordDataSize());
+  Proxy.Hash = ArrayRef(reinterpret_cast<const uint8_t *>(Proxy.Data.end()),
+                        getNumHashBytes());
+  return RecordData{Proxy, Offset};
+}
+
+Expected<TrieRawHashMapHandle::RecordData>
+TrieRawHashMapHandle::createRecord(MappedFileRegionArena &Alloc,
+                                   ArrayRef<uint8_t> Hash) {
+  assert(&Alloc.getRegion() == Region);
+  assert(Hash.size() == getNumHashBytes());
+  auto Offset = Alloc.allocateOffset(getRecordSize());
+  if (LLVM_UNLIKELY(!Offset))
+    return Offset.takeError();
+
+  RecordData Record = getRecord(SubtrieSlotValue::getDataOffset(*Offset));
+  llvm::copy(Hash, const_cast<uint8_t *>(Record.Proxy.Hash.begin()));
+  return Record;
+}
+
+Expected<OnDiskTrieRawHashMap::const_pointer>
+OnDiskTrieRawHashMap::recoverFromFileOffset(FileOffset Offset) const {
+  // Check alignment.
+  if (!isAligned(MappedFileRegionArena::getAlign(), Offset.get()))
+    return createStringError(make_error_code(std::errc::protocol_error),
+                             "unaligned file offset at 0x" +
+                                 utohexstr(Offset.get(), /*LowerCase=*/true));
+
+  // Check bounds.
+  //
+  // Note: There's no potential overflow when using \c uint64_t because Offset
+  // is in valid offset range and the record size is in \c [0,UINT32_MAX].
+  if (!validOffset(Offset) ||
+      Offset.get() + Impl->Trie.getRecordSize() > Impl->File.getAlloc().size())
+    return createStringError(make_error_code(std::errc::protocol_error),
+                             "file offset too large: 0x" +
+                                 utohexstr(Offset.get(), /*LowerCase=*/true));
+
+  // Looks okay...
+  TrieRawHashMapHandle::RecordData D =
+      Impl->Trie.getRecord(SubtrieSlotValue::getDataOffset(Offset));
+  return const_pointer(D.Proxy, D.getFileOffset());
+}
+
+OnDiskTrieRawHashMap::const_pointer
+OnDiskTrieRawHashMap::find(ArrayRef<uint8_t> Hash) const {
+  TrieRawHashMapHandle Trie = Impl->Trie;
+  assert(Hash.size() == Trie.getNumHashBytes() && "Invalid hash");
+
+  SubtrieHandle S = Trie.getRoot();
+  if (!S)
+    return const_pointer();
+
+  TrieHashIndexGenerator IndexGen = Trie.getIndexGen(S, Hash);
+  size_t Index = IndexGen.next();
+  for (;;) {
+    // Try to set the content.
+    SubtrieSlotValue V = S.load(Index);
+    if (!V)
+      return const_pointer();
+
+    // Check for an exact match.
+    if (V.isData()) {
+      TrieRawHashMapHandle::RecordData D = Trie.getRecord(V);
+      return D.Proxy.Hash == Hash ? const_pointer(D.Proxy, D.getFileOffset())
+                                  : const_pointer();
+    }
+
+    Index = IndexGen.next();
+    S = SubtrieHandle(Trie.getRegion(), V);
+  }
+}
+
+/// Only safe if the subtrie is empty.
+void SubtrieHandle::reinitialize(uint32_t StartBit, uint32_t NumBits) {
+  assert(StartBit > H->StartBit);
+  assert(NumBits <= H->NumBits);
+  // Ideally would also assert that all slots are empty, but that's expensive.
+
+  H->StartBit = StartBit;
+  H->NumBits = NumBits;
+}
+
+Expected<OnDiskTrieRawHashMap::pointer>
+OnDiskTrieRawHashMap::insertLazy(ArrayRef<uint8_t> Hash,
+                                 LazyInsertOnConstructCB OnConstruct,
+                                 LazyInsertOnLeakCB OnLeak) {
+  TrieRawHashMapHandle Trie = Impl->Trie;
+  assert(Hash.size() == Trie.getNumHashBytes() && "Invalid hash");
+
+  MappedFileRegionArena &Alloc = Impl->File.getAlloc();
+  std::optional<SubtrieHandle> S;
+  auto Err = Trie.getOrCreateRoot(Alloc).moveInto(S);
+  if (LLVM_UNLIKELY(Err))
+    return std::move(Err);
+
+  TrieHashIndexGenerator IndexGen = Trie.getIndexGen(*S, Hash);
+  size_t Index = IndexGen.next();
+
+  // Walk through the hash bytes and insert into correct trie position.
+  std::optional<TrieRawHashMapHandle::RecordData> NewRecord;
+  SubtrieHandle UnusedSubtrie;
+  for (;;) {
+    SubtrieSlotValue Existing = S->load(Index);
+
+    // Try to set it, if it's empty.
+    if (!Existing) {
+      if (!NewRecord) {
+        auto Err = Trie.createRecord(Alloc, Hash).moveInto(NewRecord);
+        if (LLVM_UNLIKELY(Err))
+          return std::move(Err);
+        if (OnConstruct)
+          OnConstruct(NewRecord->Offset.asDataFileOffset(), NewRecord->Proxy);
+      }
+
+      if (S->compare_exchange_strong(Index, Existing, NewRecord->Offset))
+        return pointer(NewRecord->Proxy, NewRecord->Offset.asDataFileOffset());
+
+      // Race means that Existing is no longer empty; fall through...
+    }
+
+    if (Existing.isSubtrie()) {
+      S = SubtrieHandle(Trie.getRegion(), Existing);
+      Index = IndexGen.next();
+      continue;
+    }
+
+    // Check for an exact match.
+    TrieRawHashMapHandle::RecordData ExistingRecord = Trie.getRecord(Existing);
+    if (ExistingRecord.Proxy.Hash == Hash) {
+      if (NewRecord && OnLeak)
+        OnLeak(NewRecord->Offset.asDataFileOffset(), NewRecord->Proxy,
+               ExistingRecord.Offset.asDataFileOffset(), ExistingRecord.Proxy);
+      return pointer(ExistingRecord.Proxy,
+                     ExistingRecord.Offset.asDataFileOffset());
+    }
+
+    // Sink the existing content as long as the indexes match.
+    for (;;) {
+      size_t NextIndex = IndexGen.next();
+      size_t NewIndexForExistingContent =
+          IndexGen.getCollidingBits(ExistingRecord.Proxy.Hash);
+
+      auto Err = S->sink(Index, Existing, Alloc, IndexGen.getNumBits(),
+                         UnusedSubtrie, NewIndexForExistingContent)
+                     .moveInto(S);
+      if (LLVM_UNLIKELY(Err))
+        return std::move(Err);
+      Index = NextIndex;
+
+      // Found the difference.
+      if (NextIndex != NewIndexForExistingContent)
+        break;
+    }
+  }
+}
+
+Expected<SubtrieHandle> SubtrieHandle::sink(size_t I, SubtrieSlotValue V,
+                                            MappedFileRegionArena &Alloc,
+                                            size_t NumSubtrieBits,
+                                            SubtrieHandle &UnusedSubtrie,
+                                            size_t NewI) {
+  std::optional<SubtrieHandle> NewS;
+  if (UnusedSubtrie) {
+    // Steal UnusedSubtrie and initialize it.
+    NewS.emplace();
+    std::swap(*NewS, UnusedSubtrie);
+    NewS->reinitialize(getStartBit() + getNumBits(), NumSubtrieBits);
+  } else {
+    // Allocate a new, empty subtrie.
+    auto Err = SubtrieHandle::create(Alloc, getStartBit() + getNumBits(),
+                                     NumSubtrieBits)
+                   .moveInto(NewS);
+    if (LLVM_UNLIKELY(Err))
+      return std::move(Err);
+  }
+
+  NewS->store(NewI, V);
+  if (compare_exchange_strong(I, V, NewS->getOffset()))
+    return *NewS; // Success!
+
+  // Raced.
+  assert(V.isSubtrie() && "Expected racing sink() to add a subtrie");
+
+  // Wipe out the new slot so NewS can be reused and set the out parameter.
+  NewS->store(NewI, SubtrieSlotValue());
+  UnusedSubtrie = *NewS;
+
+  // Return the subtrie added by the concurrent sink() call.
+  return SubtrieHandle(Alloc.getRegion(), V);
+}
+
+void OnDiskTrieRawHashMap::print(
+    raw_ostream &OS, function_ref<void(ArrayRef<char>)> PrintRecordData) const {
+  Impl->Trie.print(OS, PrintRecordData);
+}
+
+Error OnDiskTrieRawHashMap::validate(
+    function_ref<Error(FileOffset, ConstValueProxy)> RecordVerifier) const {
+  return Impl->Trie.validate(RecordVerifier);
+}
+
+// Helper function that prints hexdigit and have a sub-byte starting position.
+static void printHexDigits(raw_ostream &OS, ArrayRef<uint8_t> Bytes,
+                           size_t StartBit, size_t NumBits) {
+  assert(StartBit % 4 == 0);
+  assert(NumBits % 4 == 0);
+  for (size_t I = StartBit, E = StartBit + NumBits; I != E; I += 4) {
+    uint8_t HexPair = Bytes[I / 8];
+    uint8_t HexDigit = I % 8 == 0 ? HexPair >> 4 : HexPair & 0xf;
+    OS << hexdigit(HexDigit, /*LowerCase=*/true);
+  }
+}
+
+static void printBits(raw_ostream &OS, ArrayRef<uint8_t> Bytes, size_t StartBit,
+                      size_t NumBits) {
+  assert(StartBit + NumBits <= Bytes.size() * 8u);
+  for (size_t I = StartBit, E = StartBit + NumBits; I != E; ++I) {
+    uint8_t Byte = Bytes[I / 8];
+    size_t ByteOffset = I % 8;
+    if (size_t ByteShift = 8 - ByteOffset - 1)
+      Byte >>= ByteShift;
+    OS << (Byte & 0x1 ? '1' : '0');
+  }
+}
+
+void SubtrieHandle::printHash(raw_ostream &OS, ArrayRef<uint8_t> Bytes) const {
+  // afb[1c:00*01110*0]def
+  size_t EndBit = getStartBit() + getNumBits();
+  size_t HashEndBit = Bytes.size() * 8u;
+
+  size_t FirstBinaryBit = getStartBit() & ~0x3u;
+  printHexDigits(OS, Bytes, 0, FirstBinaryBit);
+
+  size_t LastBinaryBit = (EndBit + 3u) & ~0x3u;
+  OS << "[";
+  printBits(OS, Bytes, FirstBinaryBit, LastBinaryBit - FirstBinaryBit);
+  OS << "]";
+
+  printHexDigits(OS, Bytes, LastBinaryBit, HashEndBit - LastBinaryBit);
+}
+
+static void appendIndexBits(std::string &Prefix, size_t Index,
+                            size_t NumSlots) {
+  std::string Bits;
+  for (size_t NumBits = 1u; NumBits < NumSlots; NumBits <<= 1) {
+    Bits.push_back('0' + (Index & 0x1));
+    Index >>= 1;
+  }
+  for (char Ch : llvm::reverse(Bits))
+    Prefix += Ch;
+}
+
+static void printPrefix(raw_ostream &OS, StringRef Prefix) {
+  while (Prefix.size() >= 4) {
+    uint8_t Digit;
+    bool ErrorParsingBinary = Prefix.take_front(4).getAsInteger(2, Digit);
+    assert(!ErrorParsingBinary);
+    (void)ErrorParsingBinary;
+    OS << hexdigit(Digit, /*LowerCase=*/true);
+    Prefix = Prefix.drop_front(4);
+  }
+  if (!Prefix.empty())
+    OS << "[" << Prefix << "]";
+}
+
+LLVM_DUMP_METHOD void OnDiskTrieRawHashMap::dump() const { print(dbgs()); }
+
+static Expected<size_t> checkParameter(StringRef Label, size_t Max,
+                                       std::optional<size_t> Value,
+                                       std::optional<size_t> Default,
+                                       StringRef Path, StringRef TableName) {
+  assert(Value || Default);
+  assert(!Default || *Default <= Max);
+  if (!Value)
+    return *Default;
+
+  if (*Value <= Max)
+    return *Value;
+  return createTableConfigError(
+      std::errc::argument_out_of_domain, Path, TableName,
+      "invalid " + Label + ": " + Twine(*Value) + " (max: " + Twine(Max) + ")");
+}
+
+size_t OnDiskTrieRawHashMap::size() const { return Impl->File.size(); }
+size_t OnDiskTrieRawHashMap::capacity() const {
+  return Impl->File.getRegion().size();
+}
+
+Expected<OnDiskTrieRawHashMap>
+OnDiskTrieRawHashMap::create(const Twine &PathTwine, const Twine &TrieNameTwine,
+                             size_t NumHashBits, uint64_t DataSize,
+                             uint64_t MaxFileSize,
+                             std::optional<uint64_t> NewFileInitialSize,
+                             std::optional<size_t> NewTableNumRootBits,
+                             std::optional<size_t> NewTableNumSubtrieBits) {
+  SmallString<128> PathStorage;
+  StringRef Path = PathTwine.toStringRef(PathStorage);
+  SmallString<128> TrieNameStorage;
+  StringRef TrieName = TrieNameTwine.toStringRef(TrieNameStorage);
+
+  constexpr size_t DefaultNumRootBits = 10;
+  constexpr size_t DefaultNumSubtrieBits = 6;
+
+  size_t NumRootBits;
+  if (Error E = checkParameter(
+                    "root bits", TrieRawHashMapHandle::MaxNumRootBits,
+                    NewTableNumRootBits, DefaultNumRootBits, Path, TrieName)
+                    .moveInto(NumRootBits))
+    return std::move(E);
+
+  size_t NumSubtrieBits;
+  if (Error E = checkParameter("subtrie bits",
+                               TrieRawHashMapHandle::MaxNumSubtrieBits,
+                               NewTableNumSubtrieBits, DefaultNumSubtrieBits,
+                               Path, TrieName)
+                    .moveInto(NumSubtrieBits))
+    return std::move(E);
+
+  size_t NumHashBytes = NumHashBits >> 3;
+  if (Error E =
+          checkParameter("hash size", TrieRawHashMapHandle::MaxNumHashBits,
+                         NumHashBits, std::nullopt, Path, TrieName)
+              .takeError())
+    return std::move(E);
+  assert(NumHashBits == NumHashBytes << 3 &&
+         "Expected hash size to be byte-aligned");
+  if (NumHashBits != NumHashBytes << 3)
+    return createTableConfigError(
+        std::errc::argument_out_of_domain, Path, TrieName,
+        "invalid hash size: " + Twine(NumHashBits) + " (not byte-aligned)");
+
+  // Constructor for if the file doesn't exist.
+  auto NewDBConstructor = [&](DatabaseFile &DB) -> Error {
+    auto Trie =
+        TrieRawHashMapHandle::create(DB.getAlloc(), TrieName, NumRootBits,
+                                     NumSubtrieBits, NumHashBits, DataSize);
+    if (LLVM_UNLIKELY(!Trie))
+      return Trie.takeError();
+
+    return DB.addTable(*Trie);
+  };
+
+  // Get or create the file.
+  Expected<DatabaseFile> File =
+      DatabaseFile::create(Path, MaxFileSize, NewDBConstructor);
+  if (!File)
+    return File.takeError();
+
+  // Find the trie and validate it.
+  std::optional<TableHandle> Table = File->findTable(TrieName);
+  if (!Table)
+    return createTableConfigError(std::errc::argument_out_of_domain, Path,
+                                  TrieName, "table not found");
+  if (Error E = checkTable("table kind", (size_t)TrieRawHashMapHandle::Kind,
+                           (size_t)Table->getHeader().Kind, Path, TrieName))
+    return std::move(E);
+  auto Trie = Table->cast<TrieRawHashMapHandle>();
+  assert(Trie && "Already checked the kind");
+
+  // Check the hash and data size.
+  if (Error E = checkTable("hash size", NumHashBits, Trie.getNumHashBits(),
+                           Path, TrieName))
+    return std::move(E);
+  if (Error E = checkTable("data size", DataSize, Trie.getRecordDataSize(),
+                           Path, TrieName))
+    return std::move(E);
+
+  // No flags supported right now. Either corrupt, or coming from a future
+  // writer.
+  if (size_t Flags = Trie.getFlags())
+    return createTableConfigError(std::errc::invalid_argument, Path, TrieName,
+                                  "unsupported flags: " + Twine(Flags));
+
+  // Success.
+  OnDiskTrieRawHashMap::ImplType Impl{DatabaseFile(std::move(*File)), Trie};
+  return OnDiskTrieRawHashMap(std::make_unique<ImplType>(std::move(Impl)));
+}
+
+static Error createInvalidTrieError(uint64_t Offset, const Twine &Msg) {
+  return createStringError(make_error_code(std::errc::protocol_error),
+                           "invalid trie at 0x" +
+                               utohexstr(Offset, /*LowerCase=*/true) + ": " +
+                               Msg);
+}
+
+//===----------------------------------------------------------------------===//
+// TrieVisitor data structures.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// A multi-threaded vistior to traverse the Trie.
+///
+/// TODO: add more sanity checks that isn't just plain data corruption. For
+/// example, some ill-formed data can be constructed to form a cycle using
+/// Sub-Tries and it can lead to inifinite loop when visiting (or inserting
+/// data).
+class TrieVisitor {
+public:
+  TrieVisitor(TrieRawHashMapHandle Trie, unsigned ThreadCount = 0,
+              unsigned ErrorLimit = 50)
+      : Trie(Trie), ErrorLimit(ErrorLimit),
+        Threads(hardware_concurrency(ThreadCount)) {}
+  virtual ~TrieVisitor() = default;
+  Error visit();
+
+private:
+  // Virtual method to implement the action when visiting a sub-trie.
+  virtual Error visitSubTrie(StringRef Prefix, SubtrieHandle SubTrie) {
+    return Error::success();
+  }
+
+  // Virtual method to implement the action when visiting a slot in a trie node.
+  virtual Error visitSlot(unsigned I, SubtrieHandle Subtrie, StringRef Prefix,
+                          SubtrieSlotValue Slot) {
+    return Error::success();
+  }
+
+protected:
+  TrieRawHashMapHandle Trie;
+
+private:
+  Error traverseTrieNode(SubtrieHandle Node, StringRef Prefix);
+
+  Error validateSubTrie(SubtrieHandle Node, bool IsRoot);
+
+  // Helper function to capture errors when visiting the trie nodes.
+  void addError(Error NewError) {
+    assert(NewError && "not an error");
+    std::lock_guard<std::mutex> ErrorLock(Lock);
+    if (NumError >= ErrorLimit) {
+      // Too many errors.
+      consumeError(std::move(NewError));
+      return;
+    }
+
+    if (Err)
+      Err = joinErrors(std::move(*Err), std::move(NewError));
+    else
+      Err = std::move(NewError);
+    NumError++;
+  }
+
+  bool tooManyErrors() {
+    std::lock_guard<std::mutex> ErrorLock(Lock);
+    return (bool)Err && NumError >= ErrorLimit;
+  }
+
+  const unsigned ErrorLimit;
+  std::optional<Error> Err;
+  unsigned NumError = 0;
+  std::mutex Lock;
+  DefaultThreadPool Threads;
+};
+
+/// A visitor that traverse and print the Trie.
+class TriePrinter : public TrieVisitor {
+public:
+  TriePrinter(TrieRawHashMapHandle Trie, raw_ostream &OS,
+              function_ref<void(ArrayRef<char>)> PrintRecordData)
+      : TrieVisitor(Trie, /*ThreadCount=*/1), OS(OS),
+        PrintRecordData(PrintRecordData) {}
+
+  Error printRecords() {
+    if (Records.empty())
+      return Error::success();
+
+    OS << "records\n";
+    llvm::sort(Records);
+    for (int64_t Offset : Records) {
+      TrieRawHashMapHandle::RecordData Record =
+          Trie.getRecord(SubtrieSlotValue::getDataOffset(Offset));
+      if (auto Err = printRecord(Record))
+        return Err;
+    }
+    return Error::success();
+  }
+
+  Error printRecord(TrieRawHashMapHandle::RecordData &Record) {
+    OS << "- addr=" << (void *)Record.getFileOffset().get() << " ";
+    if (PrintRecordData) {
+      PrintRecordData(Record.Proxy.Data);
+    } else {
+      OS << "bytes=";
+      ArrayRef<uint8_t> Data(
+          reinterpret_cast<const uint8_t *>(Record.Proxy.Data.data()),
+          Record.Proxy.Data.size());
+      printHexDigits(OS, Data, 0, Data.size() * 8);
+    }
+    OS << "\n";
+    return Error::success();
+  }
+
+  Error visitSubTrie(StringRef Prefix, SubtrieHandle SubTrie) override {
+    if (Prefix.empty()) {
+      OS << "root";
+    } else {
+      OS << "subtrie=";
+      printPrefix(OS, Prefix);
+    }
+
+    OS << " addr="
+       << (void *)(reinterpret_cast<const char *>(&SubTrie.getHeader()) -
+                   Trie.getRegion().data());
+    OS << " num-slots=" << SubTrie.getNumSlots() << "\n";
+    return Error::success();
+  }
+
+  Error visitSlot(unsigned I, SubtrieHandle Subtrie, StringRef Prefix,
+                  SubtrieSlotValue Slot) override {
+    OS << "- index=";
+    for (size_t Pad : {10, 100, 1000})
+      if (I < Pad && Subtrie.getNumSlots() >= Pad)
+        OS << "0";
+    OS << I << " ";
+    if (Slot.isSubtrie()) {
+      OS << "addr=" << (void *)Slot.asSubtrie();
+      OS << " subtrie=";
+      printPrefix(OS, Prefix);
+      OS << "\n";
+      return Error::success();
+    }
+    TrieRawHashMapHandle::RecordData Record = Trie.getRecord(Slot);
+    OS << "addr=" << (void *)Record.getFileOffset().get();
+    OS << " content=";
+    Subtrie.printHash(OS, Record.Proxy.Hash);
+    OS << "\n";
+    Records.push_back(Slot.asData());
+    return Error::success();
+  }
+
+private:
+  raw_ostream &OS;
+  function_ref<void(ArrayRef<char>)> PrintRecordData;
+  SmallVector<int64_t> Records;
+};
+
+/// TrieVerifier that adds additional verification on top of the basic visitor.
+class TrieVerifier : public TrieVisitor {
+public:
+  TrieVerifier(
+      TrieRawHashMapHandle Trie,
+      function_ref<Error(FileOffset, OnDiskTrieRawHashMap::ConstValueProxy)>
+          RecordVerifier)
+      : TrieVisitor(Trie), RecordVerifier(RecordVerifier) {}
+
+private:
+  Error visitSubTrie(StringRef Prefix, SubtrieHandle SubTrie) final {
+    return Error::success();
+  }
+
+  Error visitSlot(unsigned I, SubtrieHandle Subtrie, StringRef Prefix,
+                  SubtrieSlotValue Slot) final {
+    if (RecordVerifier && Slot.isData()) {
+      if (!isAligned(MappedFileRegionArena::getAlign(), Slot.asData()))
+        return createInvalidTrieError(Slot.asData(), "mis-aligned data entry");
+
+      TrieRawHashMapHandle::RecordData Record =
+          Trie.getRecord(SubtrieSlotValue::getDataOffset(Slot.asData()));
+      return RecordVerifier(Slot.asDataFileOffset(),
+                            OnDiskTrieRawHashMap::ConstValueProxy{
+                                Record.Proxy.Hash, Record.Proxy.Data});
+    }
+    return Error::success();
+  }
+
+  function_ref<Error(FileOffset, OnDiskTrieRawHashMap::ConstValueProxy)>
+      RecordVerifier;
+};
+} // namespace
+
+Error TrieVisitor::visit() {
+  auto Root = Trie.getRoot();
+  if (!Root)
+    return Error::success();
+
+  if (auto Err = validateSubTrie(Root, /*IsRoot=*/true))
+    return Err;
+
+  if (auto Err = visitSubTrie("", Root))
+    return Err;
+
+  SmallVector<SubtrieHandle> Subs;
+  SmallVector<std::string> Prefixes;
+  const size_t NumSlots = Root.getNumSlots();
+  for (size_t I = 0, E = NumSlots; I != E; ++I) {
+    SubtrieSlotValue Slot = Root.load(I);
+    if (!Slot)
+      continue;
+    uint64_t Offset = Slot.isSubtrie() ? Slot.asSubtrie() : Slot.asData();
+    if (Offset >= (uint64_t)Trie.getRegion().size())
+      return createInvalidTrieError(Offset, "slot points out of bound");
+    std::string SubtriePrefix;
+    appendIndexBits(SubtriePrefix, I, NumSlots);
+    if (Slot.isSubtrie()) {
+      SubtrieHandle S(Trie.getRegion(), Slot);
+      Subs.push_back(S);
+      Prefixes.push_back(SubtriePrefix);
+    }
+    if (auto Err = visitSlot(I, Root, SubtriePrefix, Slot))
+      return Err;
+  }
+
+  for (size_t I = 0, E = Subs.size(); I != E; ++I) {
+    Threads.async(
+        [&](unsigned Idx) {
+          // Don't run if there is an error already.
+          if (tooManyErrors())
+            return;
+          if (auto Err = traverseTrieNode(Subs[Idx], Prefixes[Idx]))
+            addError(std::move(Err));
+        },
+        I);
+  }
+
+  Threads.wait();
+  if (Err)
+    return std::move(*Err);
+  return Error::success();
+}
+
+Error TrieVisitor::validateSubTrie(SubtrieHandle Node, bool IsRoot) {
+  char *Addr = reinterpret_cast<char *>(&Node.getHeader());
+  const int64_t Offset = Node.getFileOffset().get();
+  if (Addr + Node.getSize() >=
+      Trie.getRegion().data() + Trie.getRegion().size())
+    return createInvalidTrieError(Offset, "subtrie node spans out of bound");
+
+  if (!IsRoot &&
+      Node.getStartBit() + Node.getNumBits() > Trie.getNumHashBits()) {
+    return createInvalidTrieError(Offset,
+                                  "subtrie represents too many hash bits");
+  }
+
+  if (IsRoot) {
+    if (Node.getStartBit() != 0)
+      return createInvalidTrieError(Offset,
+                                    "root node doesn't start at 0 index");
+
+    return Error::success();
+  }
+
+  if (Node.getNumBits() > Trie.getNumSubtrieBits())
+    return createInvalidTrieError(Offset, "subtrie has wrong number of slots");
+
+  return Error::success();
+}
+
+Error TrieVisitor::traverseTrieNode(SubtrieHandle Node, StringRef Prefix) {
+  if (auto Err = validateSubTrie(Node, /*IsRoot=*/false))
+    return Err;
+
+  if (auto Err = visitSubTrie(Prefix, Node))
+    return Err;
+
+  SmallVector<SubtrieHandle> Subs;
+  SmallVector<std::string> Prefixes;
+  const size_t NumSlots = Node.getNumSlots();
+  for (size_t I = 0, E = NumSlots; I != E; ++I) {
+    SubtrieSlotValue Slot = Node.load(I);
+    if (!Slot)
+      continue;
+    uint64_t Offset = Slot.isSubtrie() ? Slot.asSubtrie() : Slot.asData();
+    if (Offset >= (uint64_t)Trie.getRegion().size())
+      return createInvalidTrieError(Offset, "slot points out of bound");
+    std::string SubtriePrefix = Prefix.str();
+    appendIndexBits(SubtriePrefix, I, NumSlots);
+    if (Slot.isSubtrie()) {
+      SubtrieHandle S(Trie.getRegion(), Slot);
+      Subs.push_back(S);
+      Prefixes.push_back(SubtriePrefix);
+    }
+    if (auto Err = visitSlot(I, Node, SubtriePrefix, Slot))
+      return Err;
+  }
+  for (size_t I = 0, E = Subs.size(); I != E; ++I)
+    if (auto Err = traverseTrieNode(Subs[I], Prefixes[I]))
+      return Err;
+
+  return Error::success();
+}
+
+void TrieRawHashMapHandle::print(
+    raw_ostream &OS, function_ref<void(ArrayRef<char>)> PrintRecordData) const {
+  OS << "hash-num-bits=" << getNumHashBits()
+     << " hash-size=" << getNumHashBytes()
+     << " record-data-size=" << getRecordDataSize() << "\n";
+
+  TriePrinter Printer(*this, OS, PrintRecordData);
+  if (auto Err = Printer.visit())
+    OS << "error: " << toString(std::move(Err)) << "\n";
+
+  if (auto Err = Printer.printRecords())
+    OS << "error: " << toString(std::move(Err)) << "\n";
+
+  return;
+}
+
+Error TrieRawHashMapHandle::validate(
+    function_ref<Error(FileOffset, OnDiskTrieRawHashMap::ConstValueProxy)>
+        RecordVerifier) const {
+  // Use the base TrieVisitor to identify the errors inside trie first.
+  TrieVisitor BasicVerifier(*this);
+  if (auto Err = BasicVerifier.visit())
+    return Err;
+
+  // If the trie data structure is sound, do a second pass to verify data and
+  // verifier function can assume the index is correct. However, there can be
+  // newly added bad entries that can still produce error.
+  TrieVerifier Verifier(*this, RecordVerifier);
+  return Verifier.visit();
+}
+
+#else // !LLVM_ENABLE_ONDISK_CAS
+
+struct OnDiskTrieRawHashMap::ImplType {};
+
+Expected<OnDiskTrieRawHashMap>
+OnDiskTrieRawHashMap::create(const Twine &PathTwine, const Twine &TrieNameTwine,
+                             size_t NumHashBits, uint64_t DataSize,
+                             uint64_t MaxFileSize,
+                             std::optional<uint64_t> NewFileInitialSize,
+                             std::optional<size_t> NewTableNumRootBits,
+                             std::optional<size_t> NewTableNumSubtrieBits) {
+  return createStringError(make_error_code(std::errc::not_supported),
+                           "OnDiskTrieRawHashMap is not supported");
+}
+
+Expected<OnDiskTrieRawHashMap::pointer>
+OnDiskTrieRawHashMap::insertLazy(ArrayRef<uint8_t> Hash,
+                                 LazyInsertOnConstructCB OnConstruct,
+                                 LazyInsertOnLeakCB OnLeak) {
+  return createStringError(make_error_code(std::errc::not_supported),
+                           "OnDiskTrieRawHashMap is not supported");
+}
+
+Expected<OnDiskTrieRawHashMap::const_pointer>
+OnDiskTrieRawHashMap::recoverFromFileOffset(FileOffset Offset) const {
+  return createStringError(make_error_code(std::errc::not_supported),
+                           "OnDiskTrieRawHashMap is not supported");
+}
+
+OnDiskTrieRawHashMap::const_pointer
+OnDiskTrieRawHashMap::find(ArrayRef<uint8_t> Hash) const {
+  return const_pointer();
+}
+
+void OnDiskTrieRawHashMap::print(
+    raw_ostream &OS, function_ref<void(ArrayRef<char>)> PrintRecordData) const {
+}
+
+Error OnDiskTrieRawHashMap::validate(
+    function_ref<Error(FileOffset, OnDiskTrieRawHashMap::ConstValueProxy)>
+        RecordVerifier) const {
+  return createStringError(make_error_code(std::errc::not_supported),
+                           "OnDiskTrieRawHashMap is not supported");
+}
+
+size_t OnDiskTrieRawHashMap::size() const { return 0; }
+size_t OnDiskTrieRawHashMap::capacity() const { return 0; }
+
+#endif // LLVM_ENABLE_ONDISK_CAS
+
+OnDiskTrieRawHashMap::OnDiskTrieRawHashMap(std::unique_ptr<ImplType> Impl)
+    : Impl(std::move(Impl)) {}
+OnDiskTrieRawHashMap::OnDiskTrieRawHashMap(OnDiskTrieRawHashMap &&RHS) =
+    default;
+OnDiskTrieRawHashMap &
+OnDiskTrieRawHashMap::operator=(OnDiskTrieRawHashMap &&RHS) = default;
+OnDiskTrieRawHashMap::~OnDiskTrieRawHashMap() = default;
diff --git a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
index 0f3ff98..d98d180 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -105,6 +105,8 @@ DebugHandlerBase::~DebugHandlerBase() = default;
 void DebugHandlerBase::beginModule(Module *M) {
   if (M->debug_compile_units().empty())
     Asm = nullptr;
+  else
+    LScopes.initialize(*M);
 }
 
 // Each LexicalScope has first instruction and last instruction to mark
@@ -269,7 +271,7 @@ void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
 
   // Grab the lexical scopes for the function, if we don't have any of those
   // then we're not going to be able to do anything.
-  LScopes.initialize(*MF);
+  LScopes.scanFunction(*MF);
   if (LScopes.empty()) {
     beginFunctionImpl(MF);
     return;
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 67f526f..518121e 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -537,8 +537,9 @@ void DwarfCompileUnit::addWasmRelocBaseGlobal(DIELoc *Loc, StringRef GlobalName,
 // and DW_AT_high_pc attributes. If there are global variables in this
 // scope then create and insert DIEs for these variables.
 DIE &DwarfCompileUnit::updateSubprogramScopeDIE(const DISubprogram *SP,
+                                                const Function &F,
                                                 MCSymbol *LineTableSym) {
-  DIE *SPDie = getOrCreateSubprogramDIE(SP, includeMinimalInlineScopes());
+  DIE *SPDie = getOrCreateSubprogramDIE(SP, &F, includeMinimalInlineScopes());
   SmallVector<RangeSpan, 2> BB_List;
   // If basic block sections are on, ranges for each basic block section has
   // to be emitted separately.
@@ -1122,9 +1123,10 @@ sortLocalVars(SmallVectorImpl<DbgVariable *> &Input) {
 }
 
 DIE &DwarfCompileUnit::constructSubprogramScopeDIE(const DISubprogram *Sub,
+                                                   const Function &F,
                                                    LexicalScope *Scope,
                                                    MCSymbol *LineTableSym) {
-  DIE &ScopeDIE = updateSubprogramScopeDIE(Sub, LineTableSym);
+  DIE &ScopeDIE = updateSubprogramScopeDIE(Sub, F, LineTableSym);
 
   if (Scope) {
     assert(!Scope->getInlinedAt());
@@ -1198,32 +1200,17 @@ DIE *DwarfCompileUnit::createAndAddScopeChildren(LexicalScope *Scope,
   return ObjectPointer;
 }
 
-void DwarfCompileUnit::constructAbstractSubprogramScopeDIE(
-    LexicalScope *Scope) {
-  auto *SP = cast<DISubprogram>(Scope->getScopeNode());
-  if (getAbstractScopeDIEs().count(SP))
-    return;
+DIE &DwarfCompileUnit::getOrCreateAbstractSubprogramDIE(
+    const DISubprogram *SP) {
+  if (auto *AbsDef = getAbstractScopeDIEs().lookup(SP))
+    return *AbsDef;
 
-  DIE *ContextDIE;
-  DwarfCompileUnit *ContextCU = this;
-
-  if (includeMinimalInlineScopes())
-    ContextDIE = &getUnitDie();
-  // Some of this is duplicated from DwarfUnit::getOrCreateSubprogramDIE, with
-  // the important distinction that the debug node is not associated with the
-  // DIE (since the debug node will be associated with the concrete DIE, if
-  // any). It could be refactored to some common utility function.
-  else if (auto *SPDecl = SP->getDeclaration()) {
-    ContextDIE = &getUnitDie();
-    getOrCreateSubprogramDIE(SPDecl);
-  } else {
-    ContextDIE = getOrCreateContextDIE(SP->getScope());
-    // The scope may be shared with a subprogram that has already been
-    // constructed in another CU, in which case we need to construct this
-    // subprogram in the same CU.
-    ContextCU = DD->lookupCU(ContextDIE->getUnitDie());
-  }
+  auto [ContextDIE, ContextCU] = getOrCreateAbstractSubprogramContextDIE(SP);
+  return createAbstractSubprogramDIE(SP, ContextDIE, ContextCU);
+}
 
+DIE &DwarfCompileUnit::createAbstractSubprogramDIE(
+    const DISubprogram *SP, DIE *ContextDIE, DwarfCompileUnit *ContextCU) {
   // Passing null as the associated node because the abstract definition
   // shouldn't be found by lookup.
   DIE &AbsDef = ContextCU->createAndAddDIE(dwarf::DW_TAG_subprogram,
@@ -1237,8 +1224,45 @@ void DwarfCompileUnit::constructAbstractSubprogramScopeDIE(
                      DD->getDwarfVersion() <= 4 ? std::optional<dwarf::Form>()
                                                 : dwarf::DW_FORM_implicit_const,
                      dwarf::DW_INL_inlined);
-  if (DIE *ObjectPointer = ContextCU->createAndAddScopeChildren(Scope, AbsDef))
-    ContextCU->addDIEEntry(AbsDef, dwarf::DW_AT_object_pointer, *ObjectPointer);
+
+  return AbsDef;
+}
+
+std::pair<DIE *, DwarfCompileUnit *>
+DwarfCompileUnit::getOrCreateAbstractSubprogramContextDIE(
+    const DISubprogram *SP) {
+  bool Minimal = includeMinimalInlineScopes();
+  bool IgnoreScope = shouldPlaceInUnitDIE(SP, Minimal);
+  DIE *ContextDIE = getOrCreateSubprogramContextDIE(SP, IgnoreScope);
+
+  if (auto *SPDecl = SP->getDeclaration())
+    if (!Minimal)
+      getOrCreateSubprogramDIE(SPDecl, nullptr);
+
+  // The scope may be shared with a subprogram that has already been
+  // constructed in another CU, in which case we need to construct this
+  // subprogram in the same CU.
+  auto *ContextCU = IgnoreScope ? this : DD->lookupCU(ContextDIE->getUnitDie());
+
+  return std::make_pair(ContextDIE, ContextCU);
+}
+
+void DwarfCompileUnit::constructAbstractSubprogramScopeDIE(
+    LexicalScope *Scope) {
+  auto *SP = cast<DISubprogram>(Scope->getScopeNode());
+
+  // Populate subprogram DIE only once.
+  if (!getFinalizedAbstractSubprograms().insert(SP).second)
+    return;
+
+  auto [ContextDIE, ContextCU] = getOrCreateAbstractSubprogramContextDIE(SP);
+  DIE *AbsDef = getAbstractScopeDIEs().lookup(SP);
+  if (!AbsDef)
+    AbsDef = &createAbstractSubprogramDIE(SP, ContextDIE, ContextCU);
+
+  if (DIE *ObjectPointer = ContextCU->createAndAddScopeChildren(Scope, *AbsDef))
+    ContextCU->addDIEEntry(*AbsDef, dwarf::DW_AT_object_pointer,
+                           *ObjectPointer);
 }
 
 bool DwarfCompileUnit::useGNUAnalogForDwarf5Feature() const {
@@ -1293,9 +1317,9 @@ DwarfCompileUnit::getDwarf5OrGNULocationAtom(dwarf::LocationAtom Loc) const {
 }
 
 DIE &DwarfCompileUnit::constructCallSiteEntryDIE(
-    DIE &ScopeDIE, const DISubprogram *CalleeSP, bool IsTail,
-    const MCSymbol *PCAddr, const MCSymbol *CallAddr, unsigned CallReg,
-    DIType *AllocSiteTy) {
+    DIE &ScopeDIE, const DISubprogram *CalleeSP, const Function *CalleeF,
+    bool IsTail, const MCSymbol *PCAddr, const MCSymbol *CallAddr,
+    unsigned CallReg, DIType *AllocSiteTy) {
   // Insert a call site entry DIE within ScopeDIE.
   DIE &CallSiteDIE = createAndAddDIE(getDwarf5OrGNUTag(dwarf::DW_TAG_call_site),
                                      ScopeDIE, nullptr);
@@ -1305,7 +1329,7 @@ DIE &DwarfCompileUnit::constructCallSiteEntryDIE(
     addAddress(CallSiteDIE, getDwarf5OrGNUAttr(dwarf::DW_AT_call_target),
                MachineLocation(CallReg));
   } else if (CalleeSP) {
-    DIE *CalleeDIE = getOrCreateSubprogramDIE(CalleeSP);
+    DIE *CalleeDIE = getOrCreateSubprogramDIE(CalleeSP, CalleeF);
     assert(CalleeDIE && "Could not create DIE for call site entry origin");
     if (AddLinkageNamesToDeclCallOriginsForTuning(DD) &&
         !CalleeSP->isDefinition() &&
@@ -1396,7 +1420,7 @@ DIE *DwarfCompileUnit::constructImportedEntityDIE(
     if (auto *AbsSPDie = getAbstractScopeDIEs().lookup(SP))
       EntityDie = AbsSPDie;
     else
-      EntityDie = getOrCreateSubprogramDIE(SP);
+      EntityDie = getOrCreateSubprogramDIE(SP, nullptr);
   } else if (auto *T = dyn_cast<DIType>(Entity))
     EntityDie = getOrCreateTypeDIE(T);
   else if (auto *GV = dyn_cast<DIGlobalVariable>(Entity))
@@ -1805,3 +1829,20 @@ DIE *DwarfCompileUnit::getOrCreateContextDIE(const DIScope *Context) {
   }
   return DwarfUnit::getOrCreateContextDIE(Context);
 }
+
+DIE *DwarfCompileUnit::getOrCreateSubprogramDIE(const DISubprogram *SP,
+                                                const Function *F,
+                                                bool Minimal) {
+  if (!F && SP->isDefinition()) {
+    F = DD->getLexicalScopes().getFunction(SP);
+
+    if (!F) {
+      // SP may belong to another CU. Determine the CU similarly
+      // to DwarfDebug::constructAbstractSubprogramScopeDIE.
+      return &DD->getOrCreateAbstractSubprogramCU(SP, *this)
+                  .getOrCreateAbstractSubprogramDIE(SP);
+    }
+  }
+
+  return DwarfUnit::getOrCreateSubprogramDIE(SP, F, Minimal);
+}
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index c2f6ca0..a3bbc83 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -81,6 +81,7 @@ class DwarfCompileUnit final : public DwarfUnit {
 
   // List of abstract local scopes (either DISubprogram or DILexicalBlock).
   DenseMap<const DILocalScope *, DIE *> AbstractLocalScopeDIEs;
+  SmallPtrSet<const DISubprogram *, 8> FinalizedAbstractSubprograms;
 
   // List of inlined lexical block scopes that belong to subprograms within this
   // CU.
@@ -137,12 +138,28 @@ class DwarfCompileUnit final : public DwarfUnit {
     return DU->getAbstractEntities();
   }
 
+  auto &getFinalizedAbstractSubprograms() {
+    if (isDwoUnit() && !DD->shareAcrossDWOCUs())
+      return FinalizedAbstractSubprograms;
+    return DU->getFinalizedAbstractSubprograms();
+  }
+
   void finishNonUnitTypeDIE(DIE& D, const DICompositeType *CTy) override;
 
   /// Add info for Wasm-global-based relocation.
   void addWasmRelocBaseGlobal(DIELoc *Loc, StringRef GlobalName,
                               uint64_t GlobalIndex);
 
+  /// Create context DIE for abstract subprogram.
+  /// \returns The context DIE and the compile unit where abstract
+  ///          DIE should be constructed.
+  std::pair<DIE *, DwarfCompileUnit *>
+  getOrCreateAbstractSubprogramContextDIE(const DISubprogram *SP);
+
+  /// Create new DIE for abstract subprogram.
+  DIE &createAbstractSubprogramDIE(const DISubprogram *SP, DIE *ContextDIE,
+                                   DwarfCompileUnit *ContextCU);
+
 public:
   DwarfCompileUnit(unsigned UID, const DICompileUnit *Node, AsmPrinter *A,
                    DwarfDebug *DW, DwarfFile *DWU,
@@ -216,7 +233,8 @@ public:
   /// DW_AT_low_pc, DW_AT_high_pc and DW_AT_LLVM_stmt_sequence attributes.
   /// If there are global variables in this scope then create and insert DIEs
   /// for these variables.
-  DIE &updateSubprogramScopeDIE(const DISubprogram *SP, MCSymbol *LineTableSym);
+  DIE &updateSubprogramScopeDIE(const DISubprogram *SP, const Function &F,
+                                MCSymbol *LineTableSym);
 
   void constructScopeDIE(LexicalScope *Scope, DIE &ParentScopeDIE);
 
@@ -259,12 +277,18 @@ public:
   /// This instance of 'getOrCreateContextDIE()' can handle DILocalScope.
   DIE *getOrCreateContextDIE(const DIScope *Ty) override;
 
+  DIE *getOrCreateSubprogramDIE(const DISubprogram *SP, const Function *F,
+                                bool Minimal = false) override;
+
   /// Construct a DIE for this subprogram scope.
-  DIE &constructSubprogramScopeDIE(const DISubprogram *Sub, LexicalScope *Scope,
-                                   MCSymbol *LineTableSym);
+  DIE &constructSubprogramScopeDIE(const DISubprogram *Sub, const Function &F,
+                                   LexicalScope *Scope, MCSymbol *LineTableSym);
 
   DIE *createAndAddScopeChildren(LexicalScope *Scope, DIE &ScopeDIE);
 
+  /// Create an abstract subprogram DIE, that should later be populated
+  /// by \ref constructAbstractSubprogramScopeDIE.
+  DIE &getOrCreateAbstractSubprogramDIE(const DISubprogram *SP);
   void constructAbstractSubprogramScopeDIE(LexicalScope *Scope);
 
   /// Whether to use the GNU analog for a DWARF5 tag, attribute, or location
@@ -281,14 +305,15 @@ public:
   dwarf::LocationAtom getDwarf5OrGNULocationAtom(dwarf::LocationAtom Loc) const;
 
   /// Construct a call site entry DIE describing a call within \p Scope to a
-  /// callee described by \p CalleeSP.
+  /// callee described by \p CalleeSP and \p CalleeF.
   /// \p IsTail specifies whether the call is a tail call.
   /// \p PCAddr points to the PC value after the call instruction.
   /// \p CallAddr points to the PC value at the call instruction (or is null).
   /// \p CallReg is a register location for an indirect call. For direct calls
   /// the \p CallReg is set to 0.
   DIE &constructCallSiteEntryDIE(DIE &ScopeDIE, const DISubprogram *CalleeSP,
-                                 bool IsTail, const MCSymbol *PCAddr,
+                                 const Function *CalleeF, bool IsTail,
+                                 const MCSymbol *PCAddr,
                                  const MCSymbol *CallAddr, unsigned CallReg,
                                  DIType *AllocSiteTy);
   /// Construct call site parameter DIEs for the \p CallSiteDIE. The \p Params
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 05d3d18..09d5f9c 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -548,6 +548,16 @@ bool DwarfDebug::shareAcrossDWOCUs() const {
   return SplitDwarfCrossCuReferences;
 }
 
+DwarfCompileUnit &
+DwarfDebug::getOrCreateAbstractSubprogramCU(const DISubprogram *SP,
+                                            DwarfCompileUnit &SrcCU) {
+  auto &CU = getOrCreateDwarfCompileUnit(SP->getUnit());
+  if (CU.getSkeleton())
+    return shareAcrossDWOCUs() ? CU : SrcCU;
+
+  return CU;
+}
+
 void DwarfDebug::constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU,
                                                      LexicalScope *Scope) {
   assert(Scope && Scope->getScopeNode());
@@ -559,14 +569,11 @@ void DwarfDebug::constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU,
   // Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram
   // was inlined from another compile unit.
   auto &CU = getOrCreateDwarfCompileUnit(SP->getUnit());
-  if (auto *SkelCU = CU.getSkeleton()) {
-    (shareAcrossDWOCUs() ? CU : SrcCU)
-        .constructAbstractSubprogramScopeDIE(Scope);
+  auto &TargetCU = getOrCreateAbstractSubprogramCU(SP, SrcCU);
+  TargetCU.constructAbstractSubprogramScopeDIE(Scope);
+  if (auto *SkelCU = CU.getSkeleton())
     if (CU.getCUNode()->getSplitDebugInlining())
       SkelCU->constructAbstractSubprogramScopeDIE(Scope);
-  } else {
-    CU.constructAbstractSubprogramScopeDIE(Scope);
-  }
 }
 
 /// Represents a parameter whose call site value can be described by applying a
@@ -997,8 +1004,9 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP,
                                                        ->getName(CallReg)))
                         << (IsTail ? " [IsTail]" : "") << "\n");
 
-      DIE &CallSiteDIE = CU.constructCallSiteEntryDIE(
-          ScopeDIE, CalleeSP, IsTail, PCAddr, CallAddr, CallReg, AllocSiteTy);
+      DIE &CallSiteDIE =
+          CU.constructCallSiteEntryDIE(ScopeDIE, CalleeSP, CalleeDecl, IsTail,
+                                       PCAddr, CallAddr, CallReg, AllocSiteTy);
 
       // Optionally emit call-site-param debug info.
       if (emitDebugEntryValues()) {
@@ -2707,7 +2715,8 @@ void DwarfDebug::skippedNonDebugFunction() {
 
 // Gather and emit post-function debug information.
 void DwarfDebug::endFunctionImpl(const MachineFunction *MF) {
-  const DISubprogram *SP = MF->getFunction().getSubprogram();
+  const Function &F = MF->getFunction();
+  const DISubprogram *SP = F.getSubprogram();
 
   assert(CurFn == MF &&
       "endFunction should be called with the same function as beginFunction");
@@ -2776,11 +2785,12 @@ void DwarfDebug::endFunctionImpl(const MachineFunction *MF) {
 
   ProcessedSPNodes.insert(SP);
   DIE &ScopeDIE =
-      TheCU.constructSubprogramScopeDIE(SP, FnScope, FunctionLineTableLabel);
+      TheCU.constructSubprogramScopeDIE(SP, F, FnScope, FunctionLineTableLabel);
   if (auto *SkelCU = TheCU.getSkeleton())
     if (!LScopes.getAbstractScopesList().empty() &&
         TheCU.getCUNode()->getSplitDebugInlining())
-      SkelCU->constructSubprogramScopeDIE(SP, FnScope, FunctionLineTableLabel);
+      SkelCU->constructSubprogramScopeDIE(SP, F, FnScope,
+                                          FunctionLineTableLabel);
 
   FunctionLineTableLabel = nullptr;
 
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 89813dc..1a1b28a 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -906,6 +906,10 @@ public:
     return CUDieMap.lookup(Die);
   }
 
+  /// Find the matching DwarfCompileUnit for the given SP referenced from SrcCU.
+  DwarfCompileUnit &getOrCreateAbstractSubprogramCU(const DISubprogram *SP,
+                                                    DwarfCompileUnit &SrcCU);
+
   unsigned getStringTypeLoc(const DIStringType *ST) const {
     return StringTypeLocMap.lookup(ST);
   }
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h
index 0fc2b91..ef1524d 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h
@@ -11,6 +11,7 @@
 
 #include "DwarfStringPool.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/DIE.h"
@@ -27,6 +28,7 @@ class DbgVariable;
 class DbgLabel;
 class DINode;
 class DILocalScope;
+class DISubprogram;
 class DwarfCompileUnit;
 class DwarfUnit;
 class LexicalScope;
@@ -94,6 +96,9 @@ class DwarfFile {
   // Collection of abstract subprogram DIEs.
   DenseMap<const DILocalScope *, DIE *> AbstractLocalScopeDIEs;
   DenseMap<const DINode *, std::unique_ptr<DbgEntity>> AbstractEntities;
+  /// Keeps track of abstract subprograms to populate them only once.
+  // FIXME: merge creation and population of abstract scopes.
+  SmallPtrSet<const DISubprogram *, 8> FinalizedAbstractSubprograms;
 
   /// Maps MDNodes for type system with the corresponding DIEs. These DIEs can
   /// be shared across CUs, that is why we keep the map here instead
@@ -174,6 +179,10 @@ public:
     return AbstractEntities;
   }
 
+  auto &getFinalizedAbstractSubprograms() {
+    return FinalizedAbstractSubprograms;
+  }
+
   void insertDIE(const MDNode *TypeMD, DIE *Die) {
     DITypeNodeToDieMap.insert(std::make_pair(TypeMD, Die));
   }
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index d76fd0c..62fb5eb 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -573,7 +573,7 @@ DIE *DwarfUnit::getOrCreateContextDIE(const DIScope *Context) {
   if (auto *NS = dyn_cast<DINamespace>(Context))
     return getOrCreateNameSpace(NS);
   if (auto *SP = dyn_cast<DISubprogram>(Context))
-    return getOrCreateSubprogramDIE(SP);
+    return getOrCreateSubprogramDIE(SP, nullptr);
   if (auto *M = dyn_cast<DIModule>(Context))
     return getOrCreateModule(M);
   return getDIE(Context);
@@ -1066,7 +1066,7 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
       if (!Element)
         continue;
       if (auto *SP = dyn_cast<DISubprogram>(Element))
-        getOrCreateSubprogramDIE(SP);
+        getOrCreateSubprogramDIE(SP, nullptr);
       else if (auto *DDTy = dyn_cast<DIDerivedType>(Element)) {
         if (DDTy->getTag() == dwarf::DW_TAG_friend) {
           DIE &ElemDie = createAndAddDIE(dwarf::DW_TAG_friend, Buffer);
@@ -1335,22 +1335,21 @@ DIE *DwarfUnit::getOrCreateModule(const DIModule *M) {
   return &MDie;
 }
 
-DIE *DwarfUnit::getOrCreateSubprogramDIE(const DISubprogram *SP, bool Minimal) {
+DIE *DwarfUnit::getOrCreateSubprogramDIE(const DISubprogram *SP,
+                                         const Function *FnHint, bool Minimal) {
   // Construct the context before querying for the existence of the DIE in case
   // such construction creates the DIE (as is the case for member function
   // declarations).
   DIE *ContextDIE =
-      Minimal ? &getUnitDie() : getOrCreateContextDIE(SP->getScope());
+      getOrCreateSubprogramContextDIE(SP, shouldPlaceInUnitDIE(SP, Minimal));
 
   if (DIE *SPDie = getDIE(SP))
     return SPDie;
 
   if (auto *SPDecl = SP->getDeclaration()) {
     if (!Minimal) {
-      // Add subprogram definitions to the CU die directly.
-      ContextDIE = &getUnitDie();
       // Build the decl now to ensure it precedes the definition.
-      getOrCreateSubprogramDIE(SPDecl);
+      getOrCreateSubprogramDIE(SPDecl, nullptr);
       // Check whether the DIE for SP has already been created after the call
       // above.
       // FIXME: Should the creation of definition subprogram DIE during
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
index fe05766..bb00ec3 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -256,7 +256,9 @@ public:
 
   DIE *getOrCreateNameSpace(const DINamespace *NS);
   DIE *getOrCreateModule(const DIModule *M);
-  DIE *getOrCreateSubprogramDIE(const DISubprogram *SP, bool Minimal = false);
+  virtual DIE *getOrCreateSubprogramDIE(const DISubprogram *SP,
+                                        const Function *FnHint,
+                                        bool Minimal = false);
 
   void applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie,
                                  bool SkipSPAttributes = false);
@@ -343,6 +345,18 @@ protected:
   /// Emit the common part of the header for this unit.
   void emitCommonHeader(bool UseOffsets, dwarf::UnitType UT);
 
+  bool shouldPlaceInUnitDIE(const DISubprogram *SP, bool Minimal) {
+    // Add subprogram declarations to the CU die directly.
+    return Minimal || SP->getDeclaration();
+  }
+
+  DIE *getOrCreateSubprogramContextDIE(const DISubprogram *SP,
+                                       bool IgnoreScope) {
+    if (IgnoreScope)
+      return &getUnitDie();
+    return getOrCreateContextDIE(SP->getScope());
+  }
+
 private:
   /// A helper to add a wide integer constant to a DIE using a block
   /// form.
diff --git a/llvm/lib/CodeGen/LexicalScopes.cpp b/llvm/lib/CodeGen/LexicalScopes.cpp
index 5916f61..9fc9ac9 100644
--- a/llvm/lib/CodeGen/LexicalScopes.cpp
+++ b/llvm/lib/CodeGen/LexicalScopes.cpp
@@ -23,6 +23,7 @@
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
@@ -36,8 +37,16 @@ using namespace llvm;
 
 #define DEBUG_TYPE "lexicalscopes"
 
-/// reset - Reset the instance so that it's prepared for another function.
-void LexicalScopes::reset() {
+static bool skipUnit(const DICompileUnit *CU) {
+  return CU->getEmissionKind() == DICompileUnit::NoDebug;
+}
+
+void LexicalScopes::resetModule() {
+  FunctionMap.clear();
+  resetFunction();
+}
+
+void LexicalScopes::resetFunction() {
   MF = nullptr;
   CurrentFnLexicalScope = nullptr;
   LexicalScopeMap.clear();
@@ -47,12 +56,19 @@ void LexicalScopes::reset() {
   DominatedBlocks.clear();
 }
 
-/// initialize - Scan machine function and constuct lexical scope nest.
-void LexicalScopes::initialize(const MachineFunction &Fn) {
-  reset();
+void LexicalScopes::initialize(const Module &M) {
+  resetModule();
+  for (const Function &F : M) {
+    DISubprogram *SP = F.getSubprogram();
+    if (SP && (!SP->getUnit() || !skipUnit(SP->getUnit())))
+      FunctionMap[SP] = &F;
+  }
+}
+
+void LexicalScopes::scanFunction(const MachineFunction &Fn) {
+  resetFunction();
   // Don't attempt any lexical scope creation for a NoDebug compile unit.
-  if (Fn.getFunction().getSubprogram()->getUnit()->getEmissionKind() ==
-      DICompileUnit::NoDebug)
+  if (skipUnit(Fn.getFunction().getSubprogram()->getUnit()))
     return;
   MF = &Fn;
   SmallVector<InsnRange, 4> MIRanges;
@@ -143,8 +159,7 @@ LexicalScope *LexicalScopes::getOrCreateLexicalScope(const DILocalScope *Scope,
                                                      const DILocation *IA) {
   if (IA) {
     // Skip scopes inlined from a NoDebug compile unit.
-    if (Scope->getSubprogram()->getUnit()->getEmissionKind() ==
-        DICompileUnit::NoDebug)
+    if (skipUnit(Scope->getSubprogram()->getUnit()))
       return getOrCreateLexicalScope(IA);
     // Create an abstract scope for inlined function.
     getOrCreateAbstractScope(Scope);
diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
index a8143bd..0037bdd 100644
--- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
+++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
@@ -3721,7 +3721,7 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF,
   TFI = MF.getSubtarget().getFrameLowering();
   TFI->getCalleeSaves(MF, CalleeSavedRegs);
   MFI = &MF.getFrameInfo();
-  LS.initialize(MF);
+  LS.scanFunction(MF);
 
   const auto &STI = MF.getSubtarget();
   AdjustsStackInCalls = MFI->adjustsStack() &&
diff --git a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp
index 82e0c28..b9ea03f 100644
--- a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp
+++ b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp
@@ -2231,7 +2231,7 @@ bool VarLocBasedLDV::ExtendRanges(MachineFunction &MF,
   TFI->getCalleeSaves(MF, CalleeSavedRegs);
   this->ShouldEmitDebugEntryValues = ShouldEmitDebugEntryValues;
 
-  LS.initialize(MF);
+  LS.scanFunction(MF);
 
   bool Changed = false;
   bool OLChanged = false;
diff --git a/llvm/lib/CodeGen/LiveDebugVariables.cpp b/llvm/lib/CodeGen/LiveDebugVariables.cpp
index 84e9f03..001ba52 100644
--- a/llvm/lib/CodeGen/LiveDebugVariables.cpp
+++ b/llvm/lib/CodeGen/LiveDebugVariables.cpp
@@ -1263,7 +1263,7 @@ void UserValue::computeIntervals(MachineRegisterInfo &MRI,
 
 void LiveDebugVariables::LDVImpl::computeIntervals() {
   LexicalScopes LS;
-  LS.initialize(*MF);
+  LS.scanFunction(*MF);
 
   for (const auto &UV : userValues) {
     UV->computeIntervals(MF->getRegInfo(), *TRI, *LIS, LS);
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index e911ce8..1154855 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -1549,7 +1549,7 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
       report("G_BUILD_VECTOR result element type must match source type", MI);
 
     if (DstTy.getNumElements() != MI->getNumOperands() - 1)
-      report("G_BUILD_VECTOR must have an operand for each elemement", MI);
+      report("G_BUILD_VECTOR must have an operand for each element", MI);
 
     for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 2))
       if (MRI->getType(MI->getOperand(1).getReg()) != MRI->getType(MO.getReg()))
@@ -2398,11 +2398,11 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
 
     // The next two checks allow COPY between physical and virtual registers,
     // when the virtual register has a scalable size and the physical register
-    // has a fixed size. These checks allow COPY between *potentialy* mismatched
-    // sizes. However, once RegisterBankSelection occurs, MachineVerifier should
-    // be able to resolve a fixed size for the scalable vector, and at that
-    // point this function will know for sure whether the sizes are mismatched
-    // and correctly report a size mismatch.
+    // has a fixed size. These checks allow COPY between *potentially*
+    // mismatched sizes. However, once RegisterBankSelection occurs,
+    // MachineVerifier should be able to resolve a fixed size for the scalable
+    // vector, and at that point this function will know for sure whether the
+    // sizes are mismatched and correctly report a size mismatch.
     if (SrcReg.isPhysical() && DstReg.isVirtual() && DstSize.isScalable() &&
         !SrcSize.isScalable())
       break;
@@ -3213,13 +3213,13 @@ struct VRegFilter {
 
 private:
   static constexpr unsigned SparseUniverseMax = 10 * 1024 * 8;
-  // VRegs indexed within SparseUniverseMax are tracked by Sparse, those beyound
-  // are tracked by Dense. The only purpose of the threashold and the Dense set
+  // VRegs indexed within SparseUniverseMax are tracked by Sparse, those beyond
+  // are tracked by Dense. The only purpose of the threshold and the Dense set
   // is to have a reasonably growing memory usage in pathological cases (large
   // number of very sparse VRegFilter instances live at the same time). In
   // practice even in the worst-by-execution time cases having all elements
   // tracked by Sparse (very large SparseUniverseMax scenario) tends to be more
-  // space efficient than if tracked by Dense. The threashold is set to keep the
+  // space efficient than if tracked by Dense. The threshold is set to keep the
   // worst-case memory usage within 2x of figures determined empirically for
   // "all Dense" scenario in such worst-by-execution-time cases.
   BitVector Sparse;
@@ -3459,7 +3459,7 @@ void MachineVerifier::visitMachineFunctionAfter() {
 
   // Check live-in list of each MBB. If a register is live into MBB, check
   // that the register is in regsLiveOut of each predecessor block. Since
-  // this must come from a definition in the predecesssor or its live-in
+  // this must come from a definition in the predecessor or its live-in
   // list, this will catch a live-through case where the predecessor does not
   // have the register in its live-in list.  This currently only checks
   // registers that have no aliases, are not allocatable and are not
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c815686..77df4b4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17983,8 +17983,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
 
   // (fsub A, 0) -> A
   if (N1CFP && N1CFP->isZero()) {
-    if (!N1CFP->isNegative() || Options.NoSignedZerosFPMath ||
-        Flags.hasNoSignedZeros()) {
+    if (!N1CFP->isNegative() || Flags.hasNoSignedZeros()) {
       return N0;
     }
   }
@@ -17997,8 +17996,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
 
   // (fsub -0.0, N1) -> -N1
   if (N0CFP && N0CFP->isZero()) {
-    if (N0CFP->isNegative() ||
-        (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros())) {
+    if (N0CFP->isNegative() || Flags.hasNoSignedZeros()) {
       // We cannot replace an FSUB(+-0.0,X) with FNEG(X) when denormals are
       // flushed to zero, unless all users treat denorms as zero (DAZ).
       // FIXME: This transform will change the sign of a NaN and the behavior
@@ -18014,8 +18012,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
     }
   }
 
-  if ((Options.NoSignedZerosFPMath ||
-       (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) &&
+  if (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros() &&
       N1.getOpcode() == ISD::FADD) {
     // X - (X + Y) -> -Y
     if (N0 == N1->getOperand(0))
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index daebf447..dd83168 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -2847,6 +2847,7 @@ unsigned CastInst::isEliminableCastPair(
   // FPTRUNC       >       FloatPt      n/a        FloatPt      n/a
   // FPEXT         <       FloatPt      n/a        FloatPt      n/a
   // PTRTOINT     n/a      Pointer      n/a        Integral   Unsigned
+  // PTRTOADDR    n/a      Pointer      n/a        Integral   Unsigned
   // INTTOPTR     n/a      Integral   Unsigned     Pointer      n/a
   // BITCAST       =       FirstClass   n/a       FirstClass    n/a
   // ADDRSPCST    n/a      Pointer      n/a        Pointer      n/a
@@ -2878,7 +2879,7 @@ unsigned CastInst::isEliminableCastPair(
     { 99,99,99, 2, 2,99,99, 8, 2,99,99,99, 4, 0}, // FPExt          |
     {  1, 0, 0,99,99, 0, 0,99,99,99,99, 7, 3, 0}, // PtrToInt       |
     {  1, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // PtrToAddr      |
-    { 99,99,99,99,99,99,99,99,99,11,99,99,15, 0}, // IntToPtr       |
+    { 99,99,99,99,99,99,99,99,99,11,11,99,15, 0}, // IntToPtr       |
     {  5, 5, 5, 0, 0, 5, 5, 0, 0,16,16, 5, 1,14}, // BitCast        |
     {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,13,12}, // AddrSpaceCast -+
   };
@@ -2972,7 +2973,8 @@ unsigned CastInst::isEliminableCastPair(
       // zext, sext -> zext, because sext can't sign extend after zext
       return Instruction::ZExt;
     case 11: {
-      // inttoptr, ptrtoint -> bitcast if SrcSize<=PtrSize and SrcSize==DstSize
+      // inttoptr, ptrtoint/ptrtoaddr -> bitcast if SrcSize<=PtrSize and
+      // SrcSize==DstSize
       if (!MidIntPtrTy)
         return 0;
       unsigned PtrSize = MidIntPtrTy->getScalarSizeInBits();
diff --git a/llvm/lib/IR/ProfDataUtils.cpp b/llvm/lib/IR/ProfDataUtils.cpp
index 5827292..99029c1 100644
--- a/llvm/lib/IR/ProfDataUtils.cpp
+++ b/llvm/lib/IR/ProfDataUtils.cpp
@@ -252,6 +252,13 @@ void setExplicitlyUnknownBranchWeights(Instruction &I, StringRef PassName) {
                    MDB.createString(PassName)}));
 }
 
+void setExplicitlyUnknownBranchWeightsIfProfiled(Instruction &I, Function &F,
+                                                 StringRef PassName) {
+  if (std::optional<Function::ProfileCount> EC = F.getEntryCount();
+      EC && EC->getCount() > 0)
+    setExplicitlyUnknownBranchWeights(I, PassName);
+}
+
 void setExplicitlyUnknownFunctionEntryCount(Function &F, StringRef PassName) {
   MDBuilder MDB(F.getContext());
   F.setMetadata(
diff --git a/llvm/lib/Object/OffloadBundle.cpp b/llvm/lib/Object/OffloadBundle.cpp
index 1e1042c..0dd378e 100644
--- a/llvm/lib/Object/OffloadBundle.cpp
+++ b/llvm/lib/Object/OffloadBundle.cpp
@@ -89,17 +89,17 @@ Error OffloadBundleFatBin::readEntries(StringRef Buffer,
     uint64_t EntryIDSize;
     StringRef EntryID;
 
-    if (auto EC = Reader.readInteger(EntryOffset))
-      return errorCodeToError(object_error::parse_failed);
+    if (Error Err = Reader.readInteger(EntryOffset))
+      return Err;
 
-    if (auto EC = Reader.readInteger(EntrySize))
-      return errorCodeToError(object_error::parse_failed);
+    if (Error Err = Reader.readInteger(EntrySize))
+      return Err;
 
-    if (auto EC = Reader.readInteger(EntryIDSize))
-      return errorCodeToError(object_error::parse_failed);
+    if (Error Err = Reader.readInteger(EntryIDSize))
+      return Err;
 
-    if (auto EC = Reader.readFixedString(EntryID, EntryIDSize))
-      return errorCodeToError(object_error::parse_failed);
+    if (Error Err = Reader.readFixedString(EntryID, EntryIDSize))
+      return Err;
 
     auto Entry = std::make_unique<OffloadBundleEntry>(
         EntryOffset + SectionOffset, EntrySize, EntryIDSize, EntryID);
@@ -125,7 +125,7 @@ OffloadBundleFatBin::create(MemoryBufferRef Buf, uint64_t SectionOffset,
   // Read the Bundle Entries
   Error Err = TheBundle->readEntries(Buf.getBuffer(), SectionOffset);
   if (Err)
-    return errorCodeToError(object_error::parse_failed);
+    return Err;
 
   return std::unique_ptr<OffloadBundleFatBin>(TheBundle);
 }
diff --git a/llvm/lib/Support/FileCollector.cpp b/llvm/lib/Support/FileCollector.cpp
index 5dc224a..1e5de2c 100644
--- a/llvm/lib/Support/FileCollector.cpp
+++ b/llvm/lib/Support/FileCollector.cpp
@@ -68,9 +68,8 @@ void FileCollector::PathCanonicalizer::updateWithRealPath(
   SmallString<256> RealPath;
   auto DirWithSymlink = CachedDirs.find(Directory);
   if (DirWithSymlink == CachedDirs.end()) {
-    // FIXME: Should this be a call to FileSystem::getRealpath(), in some
-    // cases? What if there is nothing on disk?
-    if (sys::fs::real_path(Directory, RealPath))
+    // FIXME: What if there is nothing on disk?
+    if (VFS->getRealPath(Directory, RealPath))
       return;
     CachedDirs[Directory] = std::string(RealPath);
   } else {
diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp
index 686688a..8da6fdb 100644
--- a/llvm/lib/Support/Mustache.cpp
+++ b/llvm/lib/Support/Mustache.cpp
@@ -282,18 +282,15 @@ void stripTokenAhead(SmallVectorImpl<Token> &Tokens, size_t Idx) {
 // For example:
 //  The template string
 //  " \t{{#section}}A{{/section}}"
-// would be considered as having no text ahead and would be render as
+// would be considered as having no text ahead and would be render as:
 //  "A"
-// The exception for this is partial tag which requires us to
-// keep track of the indentation once it's rendered.
 void stripTokenBefore(SmallVectorImpl<Token> &Tokens, size_t Idx,
                       Token &CurrentToken, Token::Type CurrentType) {
   Token &PrevToken = Tokens[Idx - 1];
   StringRef PrevTokenBody = PrevToken.TokenBody;
   StringRef Unindented = PrevTokenBody.rtrim(" \r\t\v");
   size_t Indentation = PrevTokenBody.size() - Unindented.size();
-  if (CurrentType != Token::Type::Partial)
-    PrevToken.TokenBody = Unindented.str();
+  PrevToken.TokenBody = Unindented.str();
   CurrentToken.setIndentation(Indentation);
 }
 
@@ -439,7 +436,8 @@ class AddIndentationStringStream : public raw_ostream {
 public:
   explicit AddIndentationStringStream(llvm::raw_ostream &WrappedStream,
                                       size_t Indentation)
-      : Indentation(Indentation), WrappedStream(WrappedStream) {
+      : Indentation(Indentation), WrappedStream(WrappedStream),
+        NeedsIndent(true) {
     SetUnbuffered();
   }
 
@@ -448,10 +446,15 @@ protected:
     llvm::StringRef Data(Ptr, Size);
     SmallString<0> Indent;
     Indent.resize(Indentation, ' ');
+
     for (char C : Data) {
+      if (NeedsIndent && C != '\n') {
+        WrappedStream << Indent;
+        NeedsIndent = false;
+      }
       WrappedStream << C;
       if (C == '\n')
-        WrappedStream << Indent;
+        NeedsIndent = true;
     }
   }
 
@@ -460,6 +463,7 @@ protected:
 private:
   size_t Indentation;
   llvm::raw_ostream &WrappedStream;
+  bool NeedsIndent;
 };
 
 class Parser {
diff --git a/llvm/lib/Support/SipHash.cpp b/llvm/lib/Support/SipHash.cpp
index 86dad66..382d36f 100644
--- a/llvm/lib/Support/SipHash.cpp
+++ b/llvm/lib/Support/SipHash.cpp
@@ -35,14 +35,19 @@ void llvm::getSipHash_2_4_128(ArrayRef<uint8_t> In, const uint8_t (&K)[16],
   siphash<2, 4>(In.data(), In.size(), K, Out);
 }
 
-/// Compute an ABI-stable 16-bit hash of the given string.
-uint16_t llvm::getPointerAuthStableSipHash(StringRef Str) {
+/// Compute an ABI-stable 64-bit hash of the given string.
+uint64_t llvm::getStableSipHash(StringRef Str) {
   static const uint8_t K[16] = {0xb5, 0xd4, 0xc9, 0xeb, 0x79, 0x10, 0x4a, 0x79,
                                 0x6f, 0xec, 0x8b, 0x1b, 0x42, 0x87, 0x81, 0xd4};
 
   uint8_t RawHashBytes[8];
   getSipHash_2_4_64(arrayRefFromStringRef(Str), K, RawHashBytes);
-  uint64_t RawHash = endian::read64le(RawHashBytes);
+  return endian::read64le(RawHashBytes);
+}
+
+/// Compute an ABI-stable 16-bit hash of the given string.
+uint16_t llvm::getPointerAuthStableSipHash(StringRef Str) {
+  uint64_t RawHash = getStableSipHash(Str);
 
   // Produce a non-zero 16-bit discriminator.
   uint16_t Discriminator = (RawHash % 0xFFFF) + 1;
diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp
index cf78459..7ff62d4 100644
--- a/llvm/lib/Support/VirtualFileSystem.cpp
+++ b/llvm/lib/Support/VirtualFileSystem.cpp
@@ -1973,7 +1973,7 @@ private:
           EC = FS->makeAbsolute(FullPath, Name);
           Name = canonicalize(Name);
         } else {
-          EC = sys::fs::make_absolute(Name);
+          EC = FS->makeAbsolute(Name);
         }
         if (EC) {
           assert(NameValueNode && "Name presence should be checked earlier");
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 38718c4..7504f1a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -150,7 +150,10 @@ public:
       if (!CVisited.insert(CII).second)
         continue;
 
-      if (CII->getParent() == II->getParent() && !IsLookThru(II))
+      // Same-BB filter must look at the *user*; and allow non-lookthrough
+      // users when the def is a PHI (loop-header pattern).
+      if (CII->getParent() == II->getParent() && !IsLookThru(CII) &&
+          !isa<PHINode>(II))
         continue;
 
       if (isOpLegal(CII))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index d9bfeae..0a59132 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -2562,7 +2562,9 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
   for (Function *F : NeedsPostProcess)
     Splitter.processFunction(*F);
   for (Function *F : Intrinsics) {
-    if (isRemovablePointerIntrinsic(F->getIntrinsicID())) {
+    // use_empty() can also occur with cases like masked load, which will
+    // have been rewritten out of the module by now but not erased.
+    if (F->use_empty() || isRemovablePointerIntrinsic(F->getIntrinsicID())) {
       F->eraseFromParent();
     } else {
       std::optional<Function *> NewF = Intrinsic::remangleIntrinsicFunction(F);
diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
index cb57c43..d4d9e54 100644
--- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
@@ -193,7 +193,7 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB,
   // we need to invert the branch condition to jump over TrueBB when the
   // condition is false.
   auto CC = static_cast<RISCVCC::CondCode>(MI.getOperand(3).getImm());
-  CC = RISCVCC::getOppositeBranchCondition(CC);
+  CC = RISCVCC::getInverseBranchCondition(CC);
 
   // Insert branch instruction.
   BuildMI(MBB, MBBI, DL, TII->get(RISCVCC::getBrCond(CC)))
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 56db09a..6d418fd 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -1134,7 +1134,7 @@ unsigned RISCVCC::getBrCond(RISCVCC::CondCode CC, unsigned SelectOpc) {
   }
 }
 
-RISCVCC::CondCode RISCVCC::getOppositeBranchCondition(RISCVCC::CondCode CC) {
+RISCVCC::CondCode RISCVCC::getInverseBranchCondition(RISCVCC::CondCode CC) {
   switch (CC) {
   default:
     llvm_unreachable("Unrecognized conditional branch");
@@ -1554,7 +1554,7 @@ bool RISCVInstrInfo::optimizeCondBranch(MachineInstr &MI) const {
     return Register();
   };
 
-  unsigned NewOpc = RISCVCC::getBrCond(getOppositeBranchCondition(CC));
+  unsigned NewOpc = RISCVCC::getBrCond(getInverseBranchCondition(CC));
 
   // Might be case 1.
   // Don't change 0 to 1 since we can use x0.
@@ -1801,7 +1801,7 @@ RISCVInstrInfo::optimizeSelect(MachineInstr &MI,
   // Add condition code, inverting if necessary.
   auto CC = static_cast<RISCVCC::CondCode>(MI.getOperand(3).getImm());
   if (Invert)
-    CC = RISCVCC::getOppositeBranchCondition(CC);
+    CC = RISCVCC::getInverseBranchCondition(CC);
   NewMI.addImm(CC);
 
   // Copy the false register.
@@ -3978,7 +3978,7 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI,
   case RISCV::PseudoCCMOVGPR: {
     // CCMOV can be commuted by inverting the condition.
     auto CC = static_cast<RISCVCC::CondCode>(MI.getOperand(3).getImm());
-    CC = RISCVCC::getOppositeBranchCondition(CC);
+    CC = RISCVCC::getInverseBranchCondition(CC);
     auto &WorkingMI = cloneIfNew(MI);
     WorkingMI.getOperand(3).setImm(CC);
     return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI*/ false,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 2bc499b..42a0c4c 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -44,7 +44,7 @@ enum CondCode {
   COND_INVALID
 };
 
-CondCode getOppositeBranchCondition(CondCode);
+CondCode getInverseBranchCondition(CondCode);
 unsigned getBrCond(CondCode CC, unsigned SelectOpc = 0);
 
 } // end of namespace RISCVCC
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 563f3bb..d4124ae 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -167,6 +167,42 @@ static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
   return false;
 }
 
+// If this is i64 AND is part of (X & -(1 << C1) & 0xffffffff) == C2 << C1),
+// DAGCombiner can convert this to (sraiw X, C1) == sext(C2) for RV64. On RV32,
+// the type will be split so only the lower 32 bits need to be compared using
+// (srai/srli X, C) == C2.
+static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm) {
+  if (!Inst->hasOneUse())
+    return false;
+
+  // Look for equality comparison.
+  auto *Cmp = dyn_cast<ICmpInst>(*Inst->user_begin());
+  if (!Cmp || !Cmp->isEquality())
+    return false;
+
+  // Right hand side of comparison should be a constant.
+  auto *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
+  if (!C)
+    return false;
+
+  uint64_t Mask = Imm.getZExtValue();
+
+  // Mask should be of the form -(1 << C) in the lower 32 bits.
+  if (!isUInt<32>(Mask) || !isPowerOf2_32(-uint32_t(Mask)))
+    return false;
+
+  // Comparison constant should be a subset of Mask.
+  uint64_t CmpC = C->getZExtValue();
+  if ((CmpC & Mask) != CmpC)
+    return false;
+
+  // We'll need to sign extend the comparison constant and shift it right. Make
+  // sure the new constant can use addi/xori+seqz/snez.
+  unsigned ShiftBits = llvm::countr_zero(Mask);
+  int64_t NewCmpC = SignExtend64<32>(CmpC) >> ShiftBits;
+  return NewCmpC >= -2048 && NewCmpC <= 2048;
+}
+
 InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                 const APInt &Imm, Type *Ty,
                                                 TTI::TargetCostKind CostKind,
@@ -224,6 +260,9 @@ InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
     if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
         canUseShiftPair(Inst, Imm))
       return TTI::TCC_Free;
+    if (Inst && Idx == 1 && Imm.getBitWidth() == 64 &&
+        canUseShiftCmp(Inst, Imm))
+      return TTI::TCC_Free;
     Takes12BitImm = true;
     break;
   case Instruction::Add:
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
index b4fc8da..db85e33 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
@@ -587,7 +587,8 @@ bool SPIRVLegalizerInfo::legalizeIsFPClass(
   }
 
   if (FPClassTest PartialCheck = Mask & fcNan) {
-    auto InfWithQnanBitC = buildSPIRVConstant(IntTy, Inf | QNaNBitMask);
+    auto InfWithQnanBitC =
+        buildSPIRVConstant(IntTy, std::move(Inf) | QNaNBitMask);
     if (PartialCheck == fcNan) {
       // isnan(V) ==> abs(V) u> int(inf)
       appendToRes(
@@ -613,7 +614,7 @@ bool SPIRVLegalizerInfo::legalizeIsFPClass(
     APInt ExpLSB = ExpMask & ~(ExpMask.shl(1));
     auto ExpMinusOne = assignSPIRVTy(
         MIRBuilder.buildSub(IntTy, Abs, buildSPIRVConstant(IntTy, ExpLSB)));
-    APInt MaxExpMinusOne = ExpMask - ExpLSB;
+    APInt MaxExpMinusOne = std::move(ExpMask) - ExpLSB;
     auto NormalRes = assignSPIRVTy(
         MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, ExpMinusOne,
                              buildSPIRVConstant(IntTy, MaxExpMinusOne)));
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e7eb67a..cd04ff5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31215,16 +31215,16 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
     unsigned NumElts = VT.getVectorNumElements();
 
     if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
-      if (IsFSHR)
-        std::swap(Op0, Op1);
 
       if (IsCstSplat) {
+        if (IsFSHR)
+          std::swap(Op0, Op1);
         uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
         SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
         return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
                              {Op0, Op1, Imm}, DAG, Subtarget);
       }
-      return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
+      return getAVX512Node(IsFSHR ? ISD::FSHR : ISD::FSHL, DL, VT,
                            {Op0, Op1, Amt}, DAG, Subtarget);
     }
     assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
@@ -35139,8 +35139,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(VALIGN)
   NODE_NAME_CASE(VSHLD)
   NODE_NAME_CASE(VSHRD)
-  NODE_NAME_CASE(VSHLDV)
-  NODE_NAME_CASE(VSHRDV)
   NODE_NAME_CASE(PSHUFD)
   NODE_NAME_CASE(PSHUFHW)
   NODE_NAME_CASE(PSHUFLW)
@@ -45171,6 +45169,7 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
   case X86ISD::Wrapper:
   case X86ISD::WrapperRIP:
     return true;
+  case X86ISD::INSERTPS:
   case X86ISD::BLENDI:
   case X86ISD::PSHUFB:
   case X86ISD::PSHUFD:
@@ -45241,6 +45240,7 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
   case X86ISD::BLENDV:
     return false;
   // SSE target shuffles.
+  case X86ISD::INSERTPS:
   case X86ISD::PSHUFB:
   case X86ISD::PSHUFD:
   case X86ISD::UNPCKL:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 8ab8c66..b55556a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -471,8 +471,7 @@ namespace llvm {
     // VBMI2 Concat & Shift.
     VSHLD,
     VSHRD,
-    VSHLDV,
-    VSHRDV,
+
     // Shuffle Packed Values at 128-bit granularity.
     SHUF128,
     MOVDDUP,
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 2371ed4..564810c 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -12300,72 +12300,76 @@ defm : vpclmulqdq_aliases<"VPCLMULQDQZ256", VR256X, i256mem>;
 // VBMI2
 //===----------------------------------------------------------------------===//
 
-multiclass VBMI2_shift_var_rm<bits<8> Op, string OpStr, SDNode OpNode,
+multiclass VBMI2_shift_var_rm<bits<8> Op, string OpStr, SDNode OpNode, bit SwapLR,
                               X86FoldableSchedWrite sched, X86VectorVTInfo VTI> {
   let Constraints = "$src1 = $dst",
       ExeDomain   = VTI.ExeDomain in {
     defm r:   AVX512_maskable_3src<Op, MRMSrcReg, VTI, (outs VTI.RC:$dst),
                 (ins VTI.RC:$src2, VTI.RC:$src3), OpStr,
                 "$src3, $src2", "$src2, $src3",
-                (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, VTI.RC:$src3))>,
+                !if(SwapLR,
+                (VTI.VT (OpNode (VTI.VT VTI.RC:$src2), (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src3))),
+                (VTI.VT (OpNode (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src2), (VTI.VT VTI.RC:$src3))))>,
                 T8, PD, EVEX, VVVV, Sched<[sched]>;
     defm m:   AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
                 (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
                 "$src3, $src2", "$src2, $src3",
-                (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
-                        (VTI.VT (VTI.LdFrag addr:$src3))))>,
+                !if(SwapLR,
+                (VTI.VT (OpNode (VTI.VT VTI.RC:$src2), (VTI.VT VTI.RC:$src1), (VTI.VT (VTI.LdFrag addr:$src3)))),
+                (VTI.VT (OpNode (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src2), (VTI.VT (VTI.LdFrag addr:$src3)))))>,
                 T8, PD, EVEX, VVVV,
                 Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
 }
 
-multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode,
+multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode, bit SwapLR,
                                X86FoldableSchedWrite sched, X86VectorVTInfo VTI>
-         : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched, VTI> {
+         : VBMI2_shift_var_rm<Op, OpStr, OpNode, SwapLR, sched, VTI> {
   let Constraints = "$src1 = $dst",
       ExeDomain   = VTI.ExeDomain in
   defm mb:  AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
               (ins VTI.RC:$src2, VTI.ScalarMemOp:$src3), OpStr,
               "${src3}"#VTI.BroadcastStr#", $src2",
               "$src2, ${src3}"#VTI.BroadcastStr,
-              (OpNode VTI.RC:$src1, VTI.RC:$src2,
-               (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>,
+              !if(SwapLR,
+              (OpNode (VTI.VT VTI.RC:$src2), (VTI.VT VTI.RC:$src1), (VTI.VT (VTI.BroadcastLdFrag addr:$src3))),
+              (OpNode (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src2), (VTI.VT (VTI.BroadcastLdFrag addr:$src3))))>,
               T8, PD, EVEX, VVVV, EVEX_B,
               Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
-multiclass VBMI2_shift_var_rm_common<bits<8> Op, string OpStr, SDNode OpNode,
+multiclass VBMI2_shift_var_rm_common<bits<8> Op, string OpStr, SDNode OpNode, bit SwapLR,
                                      X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTI> {
   let Predicates = [HasVBMI2] in
-  defm Z      : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.ZMM, VTI.info512>,
+  defm Z      : VBMI2_shift_var_rm<Op, OpStr, OpNode, SwapLR, sched.ZMM, VTI.info512>,
                                    EVEX_V512;
   let Predicates = [HasVBMI2, HasVLX] in {
-    defm Z256 : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.YMM, VTI.info256>,
+    defm Z256 : VBMI2_shift_var_rm<Op, OpStr, OpNode, SwapLR, sched.YMM, VTI.info256>,
                                    EVEX_V256;
-    defm Z128 : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.XMM, VTI.info128>,
+    defm Z128 : VBMI2_shift_var_rm<Op, OpStr, OpNode, SwapLR, sched.XMM, VTI.info128>,
                                    EVEX_V128;
   }
 }
 
-multiclass VBMI2_shift_var_rmb_common<bits<8> Op, string OpStr, SDNode OpNode,
+multiclass VBMI2_shift_var_rmb_common<bits<8> Op, string OpStr, SDNode OpNode, bit SwapLR,
                                       X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTI> {
   let Predicates = [HasVBMI2] in
-  defm Z      : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.ZMM, VTI.info512>,
+  defm Z      : VBMI2_shift_var_rmb<Op, OpStr, OpNode, SwapLR, sched.ZMM, VTI.info512>,
                                     EVEX_V512;
   let Predicates = [HasVBMI2, HasVLX] in {
-    defm Z256 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.YMM, VTI.info256>,
+    defm Z256 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, SwapLR, sched.YMM, VTI.info256>,
                                     EVEX_V256;
-    defm Z128 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.XMM, VTI.info128>,
+    defm Z128 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, SwapLR, sched.XMM, VTI.info128>,
                                     EVEX_V128;
   }
 }
 multiclass VBMI2_shift_var<bits<8> wOp, bits<8> dqOp, string Prefix,
-                           SDNode OpNode, X86SchedWriteWidths sched> {
-  defm W : VBMI2_shift_var_rm_common<wOp, Prefix#"w", OpNode, sched,
+                           SDNode OpNode, bit SwapLR, X86SchedWriteWidths sched> {
+  defm W : VBMI2_shift_var_rm_common<wOp, Prefix#"w", OpNode, SwapLR, sched,
              avx512vl_i16_info>, REX_W, EVEX_CD8<16, CD8VF>;
-  defm D : VBMI2_shift_var_rmb_common<dqOp, Prefix#"d", OpNode, sched,
+  defm D : VBMI2_shift_var_rmb_common<dqOp, Prefix#"d", OpNode, SwapLR, sched,
              avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
-  defm Q : VBMI2_shift_var_rmb_common<dqOp, Prefix#"q", OpNode, sched,
+  defm Q : VBMI2_shift_var_rmb_common<dqOp, Prefix#"q", OpNode, SwapLR, sched,
              avx512vl_i64_info>, REX_W, EVEX_CD8<64, CD8VF>;
 }
 
@@ -12381,8 +12385,8 @@ multiclass VBMI2_shift_imm<bits<8> wOp, bits<8> dqOp, string Prefix,
 }
 
 // Concat & Shift
-defm VPSHLDV : VBMI2_shift_var<0x70, 0x71, "vpshldv", X86VShldv, SchedWriteVecIMul>;
-defm VPSHRDV : VBMI2_shift_var<0x72, 0x73, "vpshrdv", X86VShrdv, SchedWriteVecIMul>;
+defm VPSHLDV : VBMI2_shift_var<0x70, 0x71, "vpshldv", fshl, 0, SchedWriteVecIMul>;
+defm VPSHRDV : VBMI2_shift_var<0x72, 0x73, "vpshrdv", fshr, 1, SchedWriteVecIMul>;
 defm VPSHLD  : VBMI2_shift_imm<0x70, 0x71, "vpshld", X86VShld, SchedWriteVecIMul>;
 defm VPSHRD  : VBMI2_shift_imm<0x72, 0x73, "vpshrd", X86VShrd, SchedWriteVecIMul>;
 
diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td
index b4768590..031fdc1 100644
--- a/llvm/lib/Target/X86/X86InstrArithmetic.td
+++ b/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -25,18 +25,12 @@ let SchedRW = [WriteLEA] in {
                      [(set GR32:$dst, lea32addr:$src)]>,
                      OpSize32, Requires<[Not64BitMode]>;
 
-  let Predicates = [HasNDD], isCodeGenOnly = 1 in {
-    def LEA64_8r : I<0x8D, MRMSrcMem, (outs GR8:$dst), (ins lea64_8mem:$src),
-                     "lea{b}\t{$src|$dst}, {$dst|$src}",
-                     [(set GR8:$dst, lea64_iaddr:$src)]>,
-                   OpSize16,
-                   Requires<[In64BitMode]>;
-
-    def LEA64_16r : I<0x8D, MRMSrcMem, (outs GR16:$dst), (ins lea64_16mem:$src),
-                      "lea{w}\t{$src|$dst}, {$dst|$src}",
-                      [(set GR16:$dst, lea64_iaddr:$src)]>,
-                    OpSize16,
-                    Requires<[In64BitMode]>;
+  let isCodeGenOnly = 1 in {
+    def LEA64_8r : I<0x8D, MRMSrcMem, (outs GR32:$dst), (ins lea64_8mem:$src),
+                     "lea{l}\t{$src|$dst}, {$dst|$src}", []>, OpSize32;
+
+    def LEA64_16r : I<0x8D, MRMSrcMem, (outs GR32:$dst), (ins lea64_16mem:$src),
+                      "lea{l}\t{$src|$dst}, {$dst|$src}", []>, OpSize32;
   }
 
   def LEA64_32r : I<0x8D, MRMSrcMem, (outs GR32:$dst), (ins lea64_32mem:$src),
@@ -51,6 +45,11 @@ let SchedRW = [WriteLEA] in {
                       [(set GR64:$dst, lea64addr:$src)]>;
 } // SchedRW
 
+let Predicates = [HasNDD] in {
+  def : Pat<(i8 lea64_iaddr:$src), (EXTRACT_SUBREG (LEA64_8r lea64_8mem:$src), sub_8bit)>;
+  def : Pat<(i16 lea64_iaddr:$src), (EXTRACT_SUBREG (LEA64_16r lea64_16mem:$src), sub_16bit)>;
+}
+
 // Pseudo instruction for lea that prevent optimizer from eliminating
 // the instruction.
 let SchedRW = [WriteLEA], isPseudo = true, hasSideEffects = 1 in {
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index 0c20ffe..5321ecf 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -406,16 +406,6 @@ def X86VAlign  : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>;
 
 def X86VShld   : SDNode<"X86ISD::VSHLD", SDTShuff3OpI>;
 def X86VShrd   : SDNode<"X86ISD::VSHRD", SDTShuff3OpI>;
-def X86VShldv  : SDNode<"X86ISD::VSHLDV",
-                        SDTypeProfile<1, 3, [SDTCisVec<0>,
-                                             SDTCisSameAs<0,1>,
-                                             SDTCisSameAs<0,2>,
-                                             SDTCisSameAs<0,3>]>>;
-def X86VShrdv  : SDNode<"X86ISD::VSHRDV",
-                        SDTypeProfile<1, 3, [SDTCisVec<0>,
-                                             SDTCisSameAs<0,1>,
-                                             SDTCisSameAs<0,2>,
-                                             SDTCisSameAs<0,3>]>>;
 
 def X86Conflict : SDNode<"X86ISD::CONFLICT", SDTIntUnaryOp>;
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index d1ca0a6..59e103cd 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -880,11 +880,11 @@ Instruction *InstCombinerImpl::foldAddWithConstant(BinaryOperator &Add) {
   // zext(bool) + C -> bool ? C + 1 : C
   if (match(Op0, m_ZExt(m_Value(X))) &&
       X->getType()->getScalarSizeInBits() == 1)
-    return SelectInst::Create(X, InstCombiner::AddOne(Op1C), Op1);
+    return createSelectInst(X, InstCombiner::AddOne(Op1C), Op1);
   // sext(bool) + C -> bool ? C - 1 : C
   if (match(Op0, m_SExt(m_Value(X))) &&
       X->getType()->getScalarSizeInBits() == 1)
-    return SelectInst::Create(X, InstCombiner::SubOne(Op1C), Op1);
+    return createSelectInst(X, InstCombiner::SubOne(Op1C), Op1);
 
   // ~X + C --> (C-1) - X
   if (match(Op0, m_Not(m_Value(X)))) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 7a979c1..4f94aa2 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -23,6 +23,7 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/KnownBits.h"
@@ -62,14 +63,14 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
       public InstVisitor<InstCombinerImpl, Instruction *> {
 public:
   InstCombinerImpl(InstructionWorklist &Worklist, BuilderTy &Builder,
-                   bool MinimizeSize, AAResults *AA, AssumptionCache &AC,
+                   Function &F, AAResults *AA, AssumptionCache &AC,
                    TargetLibraryInfo &TLI, TargetTransformInfo &TTI,
                    DominatorTree &DT, OptimizationRemarkEmitter &ORE,
                    BlockFrequencyInfo *BFI, BranchProbabilityInfo *BPI,
                    ProfileSummaryInfo *PSI, const DataLayout &DL,
                    ReversePostOrderTraversal<BasicBlock *> &RPOT)
-      : InstCombiner(Worklist, Builder, MinimizeSize, AA, AC, TLI, TTI, DT, ORE,
-                     BFI, BPI, PSI, DL, RPOT) {}
+      : InstCombiner(Worklist, Builder, F, AA, AC, TLI, TTI, DT, ORE, BFI, BPI,
+                     PSI, DL, RPOT) {}
 
   virtual ~InstCombinerImpl() = default;
 
@@ -469,6 +470,17 @@ private:
   Value *simplifyNonNullOperand(Value *V, bool HasDereferenceable,
                                 unsigned Depth = 0);
 
+  SelectInst *createSelectInst(Value *C, Value *S1, Value *S2,
+                               const Twine &NameStr = "",
+                               InsertPosition InsertBefore = nullptr,
+                               Instruction *MDFrom = nullptr) {
+    SelectInst *SI =
+        SelectInst::Create(C, S1, S2, NameStr, InsertBefore, MDFrom);
+    if (!MDFrom)
+      setExplicitlyUnknownBranchWeightsIfProfiled(*SI, F, DEBUG_TYPE);
+    return SI;
+  }
+
 public:
   /// Create and insert the idiom we use to indicate a block is unreachable
   /// without having to rewrite the CFG from within InstCombine.
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 550f095..d457e0c 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -1253,7 +1253,7 @@ Instruction *InstCombinerImpl::visitShl(BinaryOperator &I) {
     // shl (zext i1 X), C1 --> select (X, 1 << C1, 0)
     if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
       auto *NewC = Builder.CreateShl(ConstantInt::get(Ty, 1), C1);
-      return SelectInst::Create(X, NewC, ConstantInt::getNullValue(Ty));
+      return createSelectInst(X, NewC, ConstantInt::getNullValue(Ty));
     }
   }
 
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index f0ddd5c..8fbaf68 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1735,7 +1735,7 @@ Instruction *InstCombinerImpl::foldBinopOfSextBoolToSelect(BinaryOperator &BO) {
   Constant *Zero = ConstantInt::getNullValue(BO.getType());
   Value *TVal = Builder.CreateBinOp(BO.getOpcode(), Ones, C);
   Value *FVal = Builder.CreateBinOp(BO.getOpcode(), Zero, C);
-  return SelectInst::Create(X, TVal, FVal);
+  return createSelectInst(X, TVal, FVal);
 }
 
 static Value *simplifyOperationIntoSelectOperand(Instruction &I, SelectInst *SI,
@@ -5934,8 +5934,8 @@ static bool combineInstructionsOverFunction(
     LLVM_DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
                       << F.getName() << "\n");
 
-    InstCombinerImpl IC(Worklist, Builder, F.hasMinSize(), AA, AC, TLI, TTI, DT,
-                        ORE, BFI, BPI, PSI, DL, RPOT);
+    InstCombinerImpl IC(Worklist, Builder, F, AA, AC, TLI, TTI, DT, ORE, BFI,
+                        BPI, PSI, DL, RPOT);
     IC.MaxArraySizeForCombine = MaxArraySize;
     bool MadeChangeInThisIteration = IC.prepareWorklist(F);
     MadeChangeInThisIteration |= IC.run();
diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index e5bf2d1..d842275 100644
--- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -35,6 +35,7 @@
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/Regex.h"
+#include "llvm/Support/VirtualFileSystem.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Instrumentation/CFGMST.h"
 #include "llvm/Transforms/Instrumentation/GCOVProfiler.h"
@@ -92,8 +93,10 @@ class GCOVFunction;
 
 class GCOVProfiler {
 public:
-  GCOVProfiler() : GCOVProfiler(GCOVOptions::getDefault()) {}
-  GCOVProfiler(const GCOVOptions &Opts) : Options(Opts) {}
+  GCOVProfiler()
+      : GCOVProfiler(GCOVOptions::getDefault(), *vfs::getRealFileSystem()) {}
+  GCOVProfiler(const GCOVOptions &Opts, vfs::FileSystem &VFS)
+      : Options(Opts), VFS(VFS) {}
   bool
   runOnModule(Module &M, function_ref<BlockFrequencyInfo *(Function &F)> GetBFI,
               function_ref<BranchProbabilityInfo *(Function &F)> GetBPI,
@@ -110,6 +113,7 @@ public:
     os->write_zeros(4 - s.size() % 4);
   }
   void writeBytes(const char *Bytes, int Size) { os->write(Bytes, Size); }
+  vfs::FileSystem &getVirtualFileSystem() const { return VFS; }
 
 private:
   // Create the .gcno files for the Module based on DebugInfo.
@@ -166,6 +170,7 @@ private:
   std::vector<Regex> ExcludeRe;
   DenseSet<const BasicBlock *> ExecBlocks;
   StringMap<bool> InstrumentedFiles;
+  vfs::FileSystem &VFS;
 };
 
 struct BBInfo {
@@ -214,10 +219,10 @@ static StringRef getFunctionName(const DISubprogram *SP) {
 /// Prefer relative paths in the coverage notes. Clang also may split
 /// up absolute paths into a directory and filename component. When
 /// the relative path doesn't exist, reconstruct the absolute path.
-static SmallString<128> getFilename(const DIScope *SP) {
+static SmallString<128> getFilename(const DIScope *SP, vfs::FileSystem &VFS) {
   SmallString<128> Path;
   StringRef RelPath = SP->getFilename();
-  if (sys::fs::exists(RelPath))
+  if (VFS.exists(RelPath))
     Path = RelPath;
   else
     sys::path::append(Path, SP->getDirectory(), SP->getFilename());
@@ -357,7 +362,7 @@ namespace {
 
     void writeOut(uint32_t CfgChecksum) {
       write(GCOV_TAG_FUNCTION);
-      SmallString<128> Filename = getFilename(SP);
+      SmallString<128> Filename = getFilename(SP, P->getVirtualFileSystem());
       uint32_t BlockLen = 3 + wordsOfString(getFunctionName(SP));
       BlockLen += 1 + wordsOfString(Filename) + 4;
 
@@ -455,7 +460,7 @@ bool GCOVProfiler::isFunctionInstrumented(const Function &F) {
   if (FilterRe.empty() && ExcludeRe.empty()) {
     return true;
   }
-  SmallString<128> Filename = getFilename(F.getSubprogram());
+  SmallString<128> Filename = getFilename(F.getSubprogram(), VFS);
   auto It = InstrumentedFiles.find(Filename);
   if (It != InstrumentedFiles.end()) {
     return It->second;
@@ -467,7 +472,7 @@ bool GCOVProfiler::isFunctionInstrumented(const Function &F) {
   // Path can be
   // /usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/c++/8/bits/*.h so for
   // such a case we must get the real_path.
-  if (sys::fs::real_path(Filename, RealPath)) {
+  if (VFS.getRealPath(Filename, RealPath)) {
     // real_path can fail with path like "foo.c".
     RealFilename = Filename;
   } else {
@@ -524,9 +529,10 @@ std::string GCOVProfiler::mangleName(const DICompileUnit *CU,
   SmallString<128> Filename = CU->getFilename();
   sys::path::replace_extension(Filename, Notes ? "gcno" : "gcda");
   StringRef FName = sys::path::filename(Filename);
-  SmallString<128> CurPath;
-  if (sys::fs::current_path(CurPath))
+  ErrorOr<std::string> CWD = VFS.getCurrentWorkingDirectory();
+  if (!CWD)
     return std::string(FName);
+  SmallString<128> CurPath{*CWD};
   sys::path::append(CurPath, FName);
   return std::string(CurPath);
 }
@@ -554,7 +560,7 @@ bool GCOVProfiler::runOnModule(
 PreservedAnalyses GCOVProfilerPass::run(Module &M,
                                         ModuleAnalysisManager &AM) {
 
-  GCOVProfiler Profiler(GCOVOpts);
+  GCOVProfiler Profiler(GCOVOpts, *VFS);
   FunctionAnalysisManager &FAM =
       AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
 
@@ -789,7 +795,7 @@ bool GCOVProfiler::emitProfileNotes(
       // Add the function line number to the lines of the entry block
       // to have a counter for the function definition.
       uint32_t Line = SP->getLine();
-      auto Filename = getFilename(SP);
+      auto Filename = getFilename(SP, VFS);
 
       BranchProbabilityInfo *BPI = GetBPI(F);
       BlockFrequencyInfo *BFI = GetBFI(F);
@@ -881,7 +887,7 @@ bool GCOVProfiler::emitProfileNotes(
           if (SP != getDISubprogram(Scope))
             continue;
 
-          GCOVLines &Lines = Block.getFile(getFilename(Loc->getScope()));
+          GCOVLines &Lines = Block.getFile(getFilename(Loc->getScope(), VFS));
           Lines.addLine(Loc.getLine());
         }
         Line = 0;
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 2d84b4a..216bdf4 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -84,7 +84,6 @@
 #include <cstdint>
 #include <iterator>
 #include <map>
-#include <numeric>
 #include <optional>
 #include <set>
 #include <tuple>
@@ -6356,25 +6355,25 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector,
     if (DefaultResult) {
       Value *ValueCompare =
           Builder.CreateICmpEQ(Condition, SecondCase, "switch.selectcmp");
-      SelectInst *SelectValueInst = cast<SelectInst>(Builder.CreateSelect(
-          ValueCompare, ResultVector[1].first, DefaultResult, "switch.select"));
-      SelectValue = SelectValueInst;
-      if (HasBranchWeights) {
+      SelectValue = Builder.CreateSelect(ValueCompare, ResultVector[1].first,
+                                         DefaultResult, "switch.select");
+      if (auto *SI = dyn_cast<SelectInst>(SelectValue);
+          SI && HasBranchWeights) {
         // We start with 3 probabilities, where the numerator is the
         // corresponding BranchWeights[i], and the denominator is the sum over
         // BranchWeights. We want the probability and negative probability of
         // Condition == SecondCase.
         assert(BranchWeights.size() == 3);
-        setBranchWeights(SelectValueInst, BranchWeights[2],
+        setBranchWeights(SI, BranchWeights[2],
                          BranchWeights[0] + BranchWeights[1],
                          /*IsExpected=*/false);
       }
     }
     Value *ValueCompare =
         Builder.CreateICmpEQ(Condition, FirstCase, "switch.selectcmp");
-    SelectInst *Ret = cast<SelectInst>(Builder.CreateSelect(
-        ValueCompare, ResultVector[0].first, SelectValue, "switch.select"));
-    if (HasBranchWeights) {
+    Value *Ret = Builder.CreateSelect(ValueCompare, ResultVector[0].first,
+                                      SelectValue, "switch.select");
+    if (auto *SI = dyn_cast<SelectInst>(Ret); SI && HasBranchWeights) {
       // We may have had a DefaultResult. Base the position of the first and
       // second's branch weights accordingly. Also the proability that Condition
       // != FirstCase needs to take that into account.
@@ -6382,7 +6381,7 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector,
       size_t FirstCasePos = (Condition != nullptr);
       size_t SecondCasePos = FirstCasePos + 1;
       uint32_t DefaultCase = (Condition != nullptr) ? BranchWeights[0] : 0;
-      setBranchWeights(Ret, BranchWeights[FirstCasePos],
+      setBranchWeights(SI, BranchWeights[FirstCasePos],
                        DefaultCase + BranchWeights[SecondCasePos],
                        /*IsExpected=*/false);
     }
@@ -6422,13 +6421,13 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector,
           Value *And = Builder.CreateAnd(Condition, AndMask);
           Value *Cmp = Builder.CreateICmpEQ(
               And, Constant::getIntegerValue(And->getType(), AndMask));
-          SelectInst *Ret = cast<SelectInst>(
-              Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult));
-          if (HasBranchWeights) {
+          Value *Ret =
+              Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult);
+          if (auto *SI = dyn_cast<SelectInst>(Ret); SI && HasBranchWeights) {
             // We know there's a Default case. We base the resulting branch
             // weights off its probability.
             assert(BranchWeights.size() >= 2);
-            setBranchWeights(Ret, accumulate(drop_begin(BranchWeights), 0),
+            setBranchWeights(SI, accumulate(drop_begin(BranchWeights), 0),
                              BranchWeights[0], /*IsExpected=*/false);
           }
           return Ret;
@@ -6448,11 +6447,11 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector,
         Value *And = Builder.CreateAnd(Condition, ~BitMask, "switch.and");
         Value *Cmp = Builder.CreateICmpEQ(
             And, Constant::getNullValue(And->getType()), "switch.selectcmp");
-        SelectInst *Ret = cast<SelectInst>(
-            Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult));
-        if (HasBranchWeights) {
+        Value *Ret =
+            Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult);
+        if (auto *SI = dyn_cast<SelectInst>(Ret); SI && HasBranchWeights) {
           assert(BranchWeights.size() >= 2);
-          setBranchWeights(Ret, accumulate(drop_begin(BranchWeights), 0),
+          setBranchWeights(SI, accumulate(drop_begin(BranchWeights), 0),
                            BranchWeights[0], /*IsExpected=*/false);
         }
         return Ret;
@@ -6466,11 +6465,11 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector,
       Value *Cmp2 = Builder.CreateICmpEQ(Condition, CaseValues[1],
                                          "switch.selectcmp.case2");
       Value *Cmp = Builder.CreateOr(Cmp1, Cmp2, "switch.selectcmp");
-      SelectInst *Ret = cast<SelectInst>(
-          Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult));
-      if (HasBranchWeights) {
+      Value *Ret =
+          Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult);
+      if (auto *SI = dyn_cast<SelectInst>(Ret); SI && HasBranchWeights) {
         assert(BranchWeights.size() >= 2);
-        setBranchWeights(Ret, accumulate(drop_begin(BranchWeights), 0),
+        setBranchWeights(SI, accumulate(drop_begin(BranchWeights), 0),
                          BranchWeights[0], /*IsExpected=*/false);
       }
       return Ret;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 065622e..c547662 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1100,7 +1100,9 @@ class BinOpSameOpcodeHelper {
       // constant + x cannot be -constant - x
       // instead, it should be x - -constant
       if (Pos == 1 ||
-          (FromOpcode == Instruction::Add && ToOpcode == Instruction::Sub))
+          ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
+            FromOpcode == Instruction::Xor) &&
+           ToOpcode == Instruction::Sub))
         return SmallVector<Value *>({LHS, RHS});
       return SmallVector<Value *>({RHS, LHS});
     }
@@ -1188,6 +1190,10 @@ public:
         if (CIValue.isAllOnes())
           InterchangeableMask = CanBeAll;
         break;
+      case Instruction::Xor:
+        if (CIValue.isZero())
+          InterchangeableMask = XorBIT | OrBIT | AndBIT | SubBIT | AddBIT;
+        break;
       default:
         if (CIValue.isZero())
           InterchangeableMask = CanBeAll;
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 0ef933f..32704bd 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2487,21 +2487,31 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
   if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
     return false;
 
+  // Check whether this is a binary shuffle.
+  bool IsBinaryShuffle = !isa<UndefValue>(V1);
+
   auto *C0 = dyn_cast<CastInst>(V0);
   auto *C1 = dyn_cast<CastInst>(V1);
-  if (!C0 || !C1)
+  if (!C0 || (IsBinaryShuffle && !C1))
     return false;
 
   Instruction::CastOps Opcode = C0->getOpcode();
-  if (C0->getSrcTy() != C1->getSrcTy())
+
+  // If this is allowed, foldShuffleOfCastops can get stuck in a loop
+  // with foldBitcastOfShuffle. Reject in favor of foldBitcastOfShuffle.
+  if (!IsBinaryShuffle && Opcode == Instruction::BitCast)
     return false;
 
-  // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
-  if (Opcode != C1->getOpcode()) {
-    if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
-      Opcode = Instruction::SExt;
-    else
+  if (IsBinaryShuffle) {
+    if (C0->getSrcTy() != C1->getSrcTy())
       return false;
+    // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
+    if (Opcode != C1->getOpcode()) {
+      if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
+        Opcode = Instruction::SExt;
+      else
+        return false;
+    }
   }
 
   auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
@@ -2544,23 +2554,31 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
   InstructionCost CostC0 =
       TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
                            TTI::CastContextHint::None, CostKind);
-  InstructionCost CostC1 =
-      TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
-                           TTI::CastContextHint::None, CostKind);
-  InstructionCost OldCost = CostC0 + CostC1;
-  OldCost +=
-      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
-                         CastDstTy, OldMask, CostKind, 0, nullptr, {}, &I);
 
-  InstructionCost NewCost =
-      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, NewShuffleDstTy,
-                         CastSrcTy, NewMask, CostKind);
+  TargetTransformInfo::ShuffleKind ShuffleKind;
+  if (IsBinaryShuffle)
+    ShuffleKind = TargetTransformInfo::SK_PermuteTwoSrc;
+  else
+    ShuffleKind = TargetTransformInfo::SK_PermuteSingleSrc;
+
+  InstructionCost OldCost = CostC0;
+  OldCost += TTI.getShuffleCost(ShuffleKind, ShuffleDstTy, CastDstTy, OldMask,
+                                CostKind, 0, nullptr, {}, &I);
+
+  InstructionCost NewCost = TTI.getShuffleCost(ShuffleKind, NewShuffleDstTy,
+                                               CastSrcTy, NewMask, CostKind);
   NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
                                   TTI::CastContextHint::None, CostKind);
   if (!C0->hasOneUse())
     NewCost += CostC0;
-  if (!C1->hasOneUse())
-    NewCost += CostC1;
+  if (IsBinaryShuffle) {
+    InstructionCost CostC1 =
+        TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
+                             TTI::CastContextHint::None, CostKind);
+    OldCost += CostC1;
+    if (!C1->hasOneUse())
+      NewCost += CostC1;
+  }
 
   LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
                     << "\n  OldCost: " << OldCost << " vs NewCost: " << NewCost
@@ -2568,14 +2586,20 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
   if (NewCost > OldCost)
     return false;
 
-  Value *Shuf = Builder.CreateShuffleVector(C0->getOperand(0),
-                                            C1->getOperand(0), NewMask);
+  Value *Shuf;
+  if (IsBinaryShuffle)
+    Shuf = Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0),
+                                       NewMask);
+  else
+    Shuf = Builder.CreateShuffleVector(C0->getOperand(0), NewMask);
+
   Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
 
   // Intersect flags from the old casts.
   if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
     NewInst->copyIRFlags(C0);
-    NewInst->andIRFlags(C1);
+    if (IsBinaryShuffle)
+      NewInst->andIRFlags(C1);
   }
 
   Worklist.pushValue(Shuf);
@@ -4433,7 +4457,7 @@ bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
 
   // Create new mask using difference of the two incoming masks.
   int MaskOffset = NewMask[0u];
-  unsigned Index = (InputNumElements - MaskOffset) % InputNumElements;
+  unsigned Index = (InputNumElements + MaskOffset) % InputNumElements;
   NewMask.clear();
 
   for (unsigned I = 0u; I < InputNumElements; ++I) {