49 files changed, 1486 insertions, 204 deletions
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index 1ec216b..f0dd7ba 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -620,8 +620,8 @@ void DWARFRewriter::updateDebugInfo() {
   uint32_t CUIndex = 0;
   std::mutex AccessMutex;
   // Needs to be invoked in the same order as CUs are processed.
-  auto createRangeLocListAddressWriters =
-      [&](DWARFUnit &CU) -> DebugLocWriter * {
+  llvm::DenseMap<uint64_t, uint64_t> LocListWritersIndexByCU;
+  auto createRangeLocListAddressWriters = [&](DWARFUnit &CU) {
     std::lock_guard<std::mutex> Lock(AccessMutex);
     const uint16_t DwarfVersion = CU.getVersion();
     if (DwarfVersion >= 5) {
@@ -641,7 +641,6 @@ void DWARFRewriter::updateDebugInfo() {
         RangeListsWritersByCU[*DWOId] = std::move(DWORangeListsSectionWriter);
       }
       AddressWritersByCU[CU.getOffset()] = std::move(AddrW);
-
     } else {
       auto AddrW =
           std::make_unique<DebugAddrWriter>(&BC, CU.getAddressByteSize());
@@ -657,7 +656,7 @@ void DWARFRewriter::updateDebugInfo() {
             std::move(LegacyRangesSectionWriterByCU);
       }
     }
-    return LocListWritersByCU[CUIndex++].get();
+    LocListWritersIndexByCU[CU.getOffset()] = CUIndex++;
   };
 
   DWARF5AcceleratorTable DebugNamesTable(opts::CreateDebugNames, BC,
@@ -666,74 +665,70 @@ void DWARFRewriter::updateDebugInfo() {
   DWPState State;
   if (opts::WriteDWP)
     initDWPState(State);
-  auto processUnitDIE = [&](DWARFUnit *Unit, DIEBuilder *DIEBlder) {
-    // Check if the unit is a skeleton and we need special updates for it and
-    // its matching split/DWO CU.
+  auto processSplitCU = [&](DWARFUnit &Unit, DWARFUnit &SplitCU,
+                            DIEBuilder &DIEBlder,
+                            DebugRangesSectionWriter &TempRangesSectionWriter,
+                            DebugAddrWriter &AddressWriter) {
+    DIEBuilder DWODIEBuilder(BC, &(SplitCU).getContext(), DebugNamesTable,
+                             &Unit);
+    DWODIEBuilder.buildDWOUnit(SplitCU);
+    std::string DWOName = "";
+    std::optional<std::string> DwarfOutputPath =
+        opts::DwarfOutputPath.empty()
+            ? std::nullopt
+            : std::optional<std::string>(opts::DwarfOutputPath.c_str());
+    {
+      std::lock_guard<std::mutex> Lock(AccessMutex);
+      DWOName = DIEBlder.updateDWONameCompDir(
+          *StrOffstsWriter, *StrWriter, Unit, DwarfOutputPath, std::nullopt);
+    }
+    DebugStrOffsetsWriter DWOStrOffstsWriter(BC);
+    DebugStrWriter DWOStrWriter((SplitCU).getContext(), true);
+    DWODIEBuilder.updateDWONameCompDirForTypes(
+        DWOStrOffstsWriter, DWOStrWriter, SplitCU, DwarfOutputPath, DWOName);
+    DebugLoclistWriter DebugLocDWoWriter(Unit, Unit.getVersion(), true,
+                                         AddressWriter);
+
+    updateUnitDebugInfo(SplitCU, DWODIEBuilder, DebugLocDWoWriter,
+                        TempRangesSectionWriter, AddressWriter);
+    DebugLocDWoWriter.finalize(DWODIEBuilder,
+                               *DWODIEBuilder.getUnitDIEbyUnit(SplitCU));
+    if (Unit.getVersion() >= 5)
+      TempRangesSectionWriter.finalizeSection();
+
+    emitDWOBuilder(DWOName, DWODIEBuilder, *this, SplitCU, Unit, State,
+                   DebugLocDWoWriter, DWOStrOffstsWriter, DWOStrWriter,
+                   GDBIndexSection);
+  };
+  auto processMainBinaryCU = [&](DWARFUnit &Unit, DIEBuilder &DIEBlder) {
     std::optional<DWARFUnit *> SplitCU;
     std::optional<uint64_t> RangesBase;
-    std::optional<uint64_t> DWOId = Unit->getDWOId();
+    std::optional<uint64_t> DWOId = Unit.getDWOId();
     if (DWOId)
       SplitCU = BC.getDWOCU(*DWOId);
-    DebugLocWriter *DebugLocWriter = createRangeLocListAddressWriters(*Unit);
-    DebugRangesSectionWriter *RangesSectionWriter =
-        Unit->getVersion() >= 5 ? RangeListsSectionWriter.get()
-                                : LegacyRangesSectionWriter.get();
-    DebugAddrWriter *AddressWriter =
-        AddressWritersByCU[Unit->getOffset()].get();
-    // Skipping CUs that failed to load.
-    if (SplitCU) {
-      DIEBuilder DWODIEBuilder(BC, &(*SplitCU)->getContext(), DebugNamesTable,
-                               Unit);
-      DWODIEBuilder.buildDWOUnit(**SplitCU);
-      std::string DWOName = "";
-      std::optional<std::string> DwarfOutputPath =
-          opts::DwarfOutputPath.empty()
-              ? std::nullopt
-              : std::optional<std::string>(opts::DwarfOutputPath.c_str());
-      {
-        std::lock_guard<std::mutex> Lock(AccessMutex);
-        DWOName = DIEBlder->updateDWONameCompDir(
-            *StrOffstsWriter, *StrWriter, *Unit, DwarfOutputPath, std::nullopt);
-      }
-      DebugStrOffsetsWriter DWOStrOffstsWriter(BC);
-      DebugStrWriter DWOStrWriter((*SplitCU)->getContext(), true);
-      DWODIEBuilder.updateDWONameCompDirForTypes(DWOStrOffstsWriter,
-                                                 DWOStrWriter, **SplitCU,
-                                                 DwarfOutputPath, DWOName);
-      DebugLoclistWriter DebugLocDWoWriter(*Unit, Unit->getVersion(), true,
-                                           *AddressWriter);
-      DebugRangesSectionWriter *TempRangesSectionWriter = RangesSectionWriter;
-      if (Unit->getVersion() >= 5) {
-        TempRangesSectionWriter = RangeListsWritersByCU[*DWOId].get();
-      } else {
-        TempRangesSectionWriter = LegacyRangesWritersByCU[*DWOId].get();
-        RangesBase = RangesSectionWriter->getSectionOffset();
-      }
-
-      updateUnitDebugInfo(*(*SplitCU), DWODIEBuilder, DebugLocDWoWriter,
-                          *TempRangesSectionWriter, *AddressWriter);
-      DebugLocDWoWriter.finalize(DWODIEBuilder,
-                                 *DWODIEBuilder.getUnitDIEbyUnit(**SplitCU));
-      if (Unit->getVersion() >= 5)
-        TempRangesSectionWriter->finalizeSection();
-
-      emitDWOBuilder(DWOName, DWODIEBuilder, *this, **SplitCU, *Unit, State,
-                     DebugLocDWoWriter, DWOStrOffstsWriter, DWOStrWriter,
-                     GDBIndexSection);
-    }
-
-    if (Unit->getVersion() >= 5) {
-      RangesBase = RangesSectionWriter->getSectionOffset() +
+    DebugLocWriter &DebugLocWriter =
+        *LocListWritersByCU[LocListWritersIndexByCU[Unit.getOffset()]].get();
+    DebugRangesSectionWriter &RangesSectionWriter =
+        Unit.getVersion() >= 5 ? *RangeListsSectionWriter.get()
+                               : *LegacyRangesSectionWriter.get();
+    DebugAddrWriter &AddressWriter =
+        *AddressWritersByCU[Unit.getOffset()].get();
+    if (Unit.getVersion() >= 5)
+      RangeListsSectionWriter->setAddressWriter(&AddressWriter);
+    if (Unit.getVersion() >= 5) {
+      RangesBase = RangesSectionWriter.getSectionOffset() +
                    getDWARF5RngListLocListHeaderSize();
-      RangesSectionWriter->initSection(*Unit);
-      StrOffstsWriter->finalizeSection(*Unit, *DIEBlder);
+      RangesSectionWriter.initSection(Unit);
+      StrOffstsWriter->finalizeSection(Unit, DIEBlder);
+    } else if (SplitCU) {
+      RangesBase = LegacyRangesSectionWriter.get()->getSectionOffset();
     }
 
-    updateUnitDebugInfo(*Unit, *DIEBlder, *DebugLocWriter, *RangesSectionWriter,
-                        *AddressWriter, RangesBase);
-    DebugLocWriter->finalize(*DIEBlder, *DIEBlder->getUnitDIEbyUnit(*Unit));
-    if (Unit->getVersion() >= 5)
-      RangesSectionWriter->finalizeSection();
+    updateUnitDebugInfo(Unit, DIEBlder, DebugLocWriter, RangesSectionWriter,
+                        AddressWriter, RangesBase);
+    DebugLocWriter.finalize(DIEBlder, *DIEBlder.getUnitDIEbyUnit(Unit));
+    if (Unit.getVersion() >= 5)
+      RangesSectionWriter.finalizeSection();
   };
 
   DIEBuilder DIEBlder(BC, BC.DwCtx.get(), DebugNamesTable);
@@ -751,8 +746,24 @@ void DWARFRewriter::updateDebugInfo() {
   CUPartitionVector PartVec = partitionCUs(*BC.DwCtx);
   for (std::vector<DWARFUnit *> &Vec : PartVec) {
     DIEBlder.buildCompileUnits(Vec);
+    for (DWARFUnit *CU : DIEBlder.getProcessedCUs()) {
+      createRangeLocListAddressWriters(*CU);
+      std::optional<DWARFUnit *> SplitCU;
+      std::optional<uint64_t> DWOId = CU->getDWOId();
+      if (DWOId)
+        SplitCU = BC.getDWOCU(*DWOId);
+      if (!SplitCU)
+        continue;
+      DebugAddrWriter &AddressWriter =
+          *AddressWritersByCU[CU->getOffset()].get();
+      DebugRangesSectionWriter *TempRangesSectionWriter =
+          CU->getVersion() >= 5 ? RangeListsWritersByCU[*DWOId].get()
+                                : LegacyRangesWritersByCU[*DWOId].get();
+      processSplitCU(*CU, **SplitCU, DIEBlder, *TempRangesSectionWriter,
+                     AddressWriter);
+    }
     for (DWARFUnit *CU : DIEBlder.getProcessedCUs())
-      processUnitDIE(CU, &DIEBlder);
+      processMainBinaryCU(*CU, DIEBlder);
     finalizeCompileUnits(DIEBlder, *Streamer, OffsetMap,
                          DIEBlder.getProcessedCUs(), *FinalAddrWriter);
   }
diff --git a/bolt/test/X86/dwarf5-dwarf4-types-backward-forward-cross-reference.test b/bolt/test/X86/dwarf5-dwarf4-types-backward-forward-cross-reference.test
index 070648c..b48d6a5 100644
--- a/bolt/test/X86/dwarf5-dwarf4-types-backward-forward-cross-reference.test
+++ b/bolt/test/X86/dwarf5-dwarf4-types-backward-forward-cross-reference.test
@@ -5,10 +5,11 @@
 # RUN: %clang %cflags %tmain.o %thelper.o -o %t.exe
 # RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections
 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt | FileCheck --check-prefix=POSTCHECK %s
+# RUN: llvm-dwarfdump --show-form --verbose --debug-addr %t.bolt | FileCheck --check-prefix=POSTCHECKADDR %s
 # RUN: llvm-dwarfdump --show-form --verbose --debug-types %t.bolt | FileCheck --check-prefix=POSTCHECKTU %s
 
 ## This test checks that BOLT handles correctly backward and forward cross CU references
-## for DWARF5 and DWARF4 with -fdebug-types-section
+## for DWARF5 and DWARF4 with -fdebug-types-section and checks the address table is correct.
 
 # POSTCHECK: version = 0x0005
 # POSTCHECK: DW_TAG_type_unit
@@ -29,6 +30,15 @@
 # POSTCHECK: DW_TAG_variable [20]
 # POSTCHECK: DW_AT_type [DW_FORM_ref_addr] (0x{{[0-9a-f]+}} "Foo3a")
 
+# POSTCHECKADDR: Addrs: [
+# POSTCHECKADDR-NEXT: 0x0000000000001360
+# POSTCHECKADDR-NEXT: 0x0000000000000000
+# POSTCHECKADDR-NEXT: ]
+# POSTCHECKADDR: Addrs: [
+# POSTCHECKADDR-NEXT: 0x00000000000013e0
+# POSTCHECKADDR-NEXT: 0x0000000000000000
+# POSTCHECKADDR-NEXT: ]
+
 # POSTCHECKTU: version = 0x0004
 # POSTCHECKTU: DW_TAG_type_unit
 # POSTCHECKTU: DW_TAG_structure_type
diff --git a/bolt/test/X86/dwarf5-locexpr-referrence.test b/bolt/test/X86/dwarf5-locexpr-referrence.test
index ea73d76..cc7bb27 100644
--- a/bolt/test/X86/dwarf5-locexpr-referrence.test
+++ b/bolt/test/X86/dwarf5-locexpr-referrence.test
@@ -5,8 +5,10 @@
 # RUN: %clang %cflags -dwarf-5 %tmain.o %thelper.o -o %t.exe -Wl,-q
 # RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections
 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt | FileCheck --check-prefix=CHECK %s
+# RUN: llvm-dwarfdump --show-form --verbose --debug-addr %t.bolt | FileCheck --check-prefix=CHECKADDR %s
 
-## This test checks that we update relative DIE references with DW_OP_convert that are in locexpr.
+## This test checks that we update relative DIE references with DW_OP_convert that are in locexpr
+## and checks the address table is correct.
 
 # CHECK: version = 0x0005
 # CHECK: DW_TAG_variable
@@ -19,3 +21,18 @@
 # CHECK-SAME: DW_OP_convert (0x00000028 -> 0x00000092)
 # CHECK-SAME: DW_OP_convert (0x0000002c -> 0x00000096)
 # CHECK: version = 0x0005
+
+# CHECKADDR: Addrs: [
+# CHECKADDR-NEXT: 0x0000000000001330
+# CHECKADDR-NEXT: 0x0000000000000000
+# CHECKADDR-NEXT: 0x0000000000001333
+# CHECKADDR-NEXT: ]
+# CHECKADDR: Addrs: [
+# CHECKADDR-NEXT: 0x0000000000001340
+# CHECKADDR-NEXT: 0x0000000000000000
+# CHECKADDR-NEXT: 0x0000000000001343
+# CHECKADDR-NEXT: ]
+# CHECKADDR: Addrs: [
+# CHECKADDR-NEXT: 0x0000000000001320
+# CHECKADDR-NEXT: 0x0000000000000000
+# CHECKADDR-NEXT: ]
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 69269cf..fa36405 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3232,6 +3232,10 @@ def fimplicit_module_maps : Flag <["-"], "fimplicit-module-maps">, Group<f_Group
   Visibility<[ClangOption, CC1Option, CLOption]>,
   HelpText<"Implicitly search the file system for module map files.">,
   MarshallingInfoFlag<HeaderSearchOpts<"ImplicitModuleMaps">>;
+defm modulemap_allow_subdirectory_search : BoolFOption <"modulemap-allow-subdirectory-search",
+  HeaderSearchOpts<"AllowModuleMapSubdirectorySearch">, DefaultTrue,
+  PosFlag<SetTrue, [], [], "Allow to search for module maps in subdirectories of search paths">,
+  NegFlag<SetFalse>, BothFlags<[NoXarchOption], [ClangOption, CC1Option]>>;
 defm modules : BoolFOption<"modules",
   LangOpts<"Modules">, Default<fcxx_modules.KeyPath>,
   PosFlag<SetTrue, [], [ClangOption, CC1Option],
diff --git a/clang/include/clang/Lex/HeaderSearchOptions.h b/clang/include/clang/Lex/HeaderSearchOptions.h
index 1763514..83a95e9 100644
--- a/clang/include/clang/Lex/HeaderSearchOptions.h
+++ b/clang/include/clang/Lex/HeaderSearchOptions.h
@@ -270,6 +270,12 @@ public:
   LLVM_PREFERRED_TYPE(bool)
   unsigned ModulesIncludeVFSUsage : 1;
 
+  /// Whether we should look for a module in module maps only in provided
+  /// header search paths or if we are allowed to look for module maps in
+  /// subdirectories of provided paths too.
+  LLVM_PREFERRED_TYPE(bool)
+  unsigned AllowModuleMapSubdirectorySearch : 1;
+
   HeaderSearchOptions(StringRef _Sysroot = "/")
       : Sysroot(_Sysroot), ModuleFormat("raw"), DisableModuleHash(false),
         ImplicitModuleMaps(false), ModuleMapFileHomeIsCwd(false),
@@ -285,7 +291,8 @@ public:
         ModulesSkipHeaderSearchPaths(false),
         ModulesSkipPragmaDiagnosticMappings(false),
         ModulesPruneNonAffectingModuleMaps(true), ModulesHashContent(false),
-        ModulesStrictContextHash(false), ModulesIncludeVFSUsage(false) {}
+        ModulesStrictContextHash(false), ModulesIncludeVFSUsage(false),
+        AllowModuleMapSubdirectorySearch(true) {}
 
   /// AddPath - Add the \p Path path to the specified \p Group list.
   void AddPath(StringRef Path, frontend::IncludeDirGroup Group,
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 78936fd..bc77b98 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -3960,6 +3960,9 @@ static bool RenderModulesOptions(Compilation &C, const Driver &D,
                    options::OPT_fno_modules_strict_decluse, false))
     CmdArgs.push_back("-fmodules-strict-decluse");
 
+  Args.addOptOutFlag(CmdArgs, options::OPT_fmodulemap_allow_subdirectory_search,
+                     options::OPT_fno_modulemap_allow_subdirectory_search);
+
   // -fno-implicit-modules turns off implicitly compiling modules on demand.
   bool ImplicitModules = false;
   if (!Args.hasFlag(options::OPT_fimplicit_modules,
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index 59453c4..61d12b1 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -609,6 +609,10 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back(Args.MakeArgString(
         "--pxtas-path=" + Args.getLastArgValue(options::OPT_ptxas_path_EQ)));
 
+  if (Args.hasArg(options::OPT_cuda_path_EQ))
+    CmdArgs.push_back(Args.MakeArgString(
+        "--cuda-path=" + Args.getLastArgValue(options::OPT_cuda_path_EQ)));
+
   // Add paths specified in LIBRARY_PATH environment variable as -L options.
   addDirectoryList(Args, CmdArgs, "-L", "LIBRARY_PATH");
 
diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
index c6f9d7b..17b6074 100644
--- a/clang/lib/Driver/ToolChains/Darwin.cpp
+++ b/clang/lib/Driver/ToolChains/Darwin.cpp
@@ -3036,6 +3036,35 @@ void Darwin::addClangTargetOptions(
   if (!DriverArgs.hasArgNoClaim(options::OPT_fdefine_target_os_macros,
                                 options::OPT_fno_define_target_os_macros))
     CC1Args.push_back("-fdefine-target-os-macros");
+
+  // Disable subdirectory modulemap search on sufficiently recent SDKs.
+  if (SDKInfo &&
+      !DriverArgs.hasFlag(options::OPT_fmodulemap_allow_subdirectory_search,
+                          options::OPT_fno_modulemap_allow_subdirectory_search,
+                          false)) {
+    bool RequiresSubdirectorySearch;
+    VersionTuple SDKVersion = SDKInfo->getVersion();
+    switch (TargetPlatform) {
+    default:
+      RequiresSubdirectorySearch = true;
+      break;
+    case MacOS:
+      RequiresSubdirectorySearch = SDKVersion < VersionTuple(15, 0);
+      break;
+    case IPhoneOS:
+    case TvOS:
+      RequiresSubdirectorySearch = SDKVersion < VersionTuple(18, 0);
+      break;
+    case WatchOS:
+      RequiresSubdirectorySearch = SDKVersion < VersionTuple(11, 0);
+      break;
+    case XROS:
+      RequiresSubdirectorySearch = SDKVersion < VersionTuple(2, 0);
+      break;
+    }
+    if (!RequiresSubdirectorySearch)
+      CC1Args.push_back("-fno-modulemap-allow-subdirectory-search");
+  }
 }
 
 void Darwin::addClangCC1ASTargetOptions(
diff --git a/clang/lib/Headers/stdarg.h b/clang/lib/Headers/stdarg.h
index 8292ab9..6203d7a 100644
--- a/clang/lib/Headers/stdarg.h
+++ b/clang/lib/Headers/stdarg.h
@@ -20,19 +20,18 @@
  * modules.
  */
 #if defined(__MVS__) && __has_include_next(<stdarg.h>)
-#include <__stdarg_header_macro.h>
 #undef __need___va_list
 #undef __need_va_list
 #undef __need_va_arg
 #undef __need___va_copy
 #undef __need_va_copy
+#include <__stdarg_header_macro.h>
 #include_next <stdarg.h>
 
 #else
 #if !defined(__need___va_list) && !defined(__need_va_list) &&                  \
     !defined(__need_va_arg) && !defined(__need___va_copy) &&                   \
     !defined(__need_va_copy)
-#include <__stdarg_header_macro.h>
 #define __need___va_list
 #define __need_va_list
 #define __need_va_arg
@@ -45,6 +44,7 @@
     !defined(__STRICT_ANSI__)
 #define __need_va_copy
 #endif
+#include <__stdarg_header_macro.h>
 #endif
 
 #ifdef __need___va_list
diff --git a/clang/lib/Headers/stddef.h b/clang/lib/Headers/stddef.h
index 8985c52..99b275a 100644
--- a/clang/lib/Headers/stddef.h
+++ b/clang/lib/Headers/stddef.h
@@ -20,7 +20,6 @@
  * modules.
  */
 #if defined(__MVS__) && __has_include_next(<stddef.h>)
-#include <__stddef_header_macro.h>
 #undef __need_ptrdiff_t
 #undef __need_size_t
 #undef __need_rsize_t
@@ -31,6 +30,7 @@
 #undef __need_max_align_t
 #undef __need_offsetof
 #undef __need_wint_t
+#include <__stddef_header_macro.h>
 #include_next <stddef.h>
 
 #else
@@ -40,7 +40,6 @@
     !defined(__need_NULL) && !defined(__need_nullptr_t) &&                     \
     !defined(__need_unreachable) && !defined(__need_max_align_t) &&            \
     !defined(__need_offsetof) && !defined(__need_wint_t)
-#include <__stddef_header_macro.h>
 #define __need_ptrdiff_t
 #define __need_size_t
 /* ISO9899:2011 7.20 (C11 Annex K): Define rsize_t if __STDC_WANT_LIB_EXT1__ is
@@ -49,7 +48,24 @@
 #define __need_rsize_t
 #endif
 #define __need_wchar_t
+#if !defined(__STDDEF_H) || __has_feature(modules)
+/*
+ * __stddef_null.h is special when building without modules: if __need_NULL is
+ * set, then it will unconditionally redefine NULL. To avoid stepping on client
+ * definitions of NULL, __need_NULL should only be set the first time this
+ * header is included, that is when __STDDEF_H is not defined. However, when
+ * building with modules, this header is a textual header and needs to
+ * unconditionally include __stddef_null.h to support multiple submodules
+ * exporting _Builtin_stddef.null. Take module SM with submodules A and B, whose
+ * headers both include stddef.h. When SM.A builds, __STDDEF_H will be defined.
+ * When SM.B builds, the definition from SM.A will leak when building without
+ * local submodule visibility. stddef.h wouldn't include __stddef_null.h, and
+ * SM.B wouldn't import _Builtin_stddef.null, and SM.B's `export *` wouldn't
+ * export NULL as expected. When building with modules, always include
+ * __stddef_null.h so that everything works as expected.
+ */
 #define __need_NULL
+#endif
 #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) ||              \
     defined(__cplusplus)
 #define __need_nullptr_t
@@ -65,6 +81,7 @@
 /* wint_t is provided by <wchar.h> and not <stddef.h>. It's here
  * for compatibility, but must be explicitly requested. Therefore
  * __need_wint_t is intentionally not defined here. */
+#include <__stddef_header_macro.h>
 #endif
 
 #if defined(__need_ptrdiff_t)
diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp
index 31a4c0f..088d1cc 100644
--- a/clang/lib/Lex/DependencyDirectivesScanner.cpp
+++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp
@@ -914,8 +914,7 @@ bool Scanner::lexPPLine(const char *&First, const char *const End) {
   case pp_import:
     // Ignore missing filenames in include or import directives.
     if (lexIncludeFilename(First, End).is(tok::eod)) {
-      skipDirective(Id, First, End);
-      return true;
+      return false;
     }
     break;
   default:
diff --git a/clang/lib/Lex/HeaderSearch.cpp b/clang/lib/Lex/HeaderSearch.cpp
index c3b3064..d2210e7 100644
--- a/clang/lib/Lex/HeaderSearch.cpp
+++ b/clang/lib/Lex/HeaderSearch.cpp
@@ -378,20 +378,22 @@ Module *HeaderSearch::lookupModule(StringRef ModuleName, StringRef SearchName,
         break;
     }
 
-    // If we've already performed the exhaustive search for module maps in this
-    // search directory, don't do it again.
-    if (Dir.haveSearchedAllModuleMaps())
-      continue;
+    if (HSOpts->AllowModuleMapSubdirectorySearch) {
+      // If we've already performed the exhaustive search for module maps in
+      // this search directory, don't do it again.
+      if (Dir.haveSearchedAllModuleMaps())
+        continue;
 
-    // Load all module maps in the immediate subdirectories of this search
-    // directory if ModuleName was from @import.
-    if (AllowExtraModuleMapSearch)
-      loadSubdirectoryModuleMaps(Dir);
+      // Load all module maps in the immediate subdirectories of this search
+      // directory if ModuleName was from @import.
+      if (AllowExtraModuleMapSearch)
+        loadSubdirectoryModuleMaps(Dir);
 
-    // Look again for the module.
-    Module = ModMap.findModule(ModuleName);
-    if (Module)
-      break;
+      // Look again for the module.
+      Module = ModMap.findModule(ModuleName);
+      if (Module)
+        break;
+    }
   }
 
   return Module;
diff --git a/clang/test/CodeGenCXX/ptrauth-static-destructors.cpp b/clang/test/CodeGenCXX/ptrauth-static-destructors.cpp
index 1240f26..634450b 100644
--- a/clang/test/CodeGenCXX/ptrauth-static-destructors.cpp
+++ b/clang/test/CodeGenCXX/ptrauth-static-destructors.cpp
@@ -2,13 +2,27 @@
 // RUN:  | FileCheck %s --check-prefix=CXAATEXIT
 
 // RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -emit-llvm -std=c++11 %s -o - \
-// RUN:    -fno-use-cxa-atexit | FileCheck %s --check-prefixes=ATEXIT,DARWIN
+// RUN:    -fno-use-cxa-atexit | FileCheck %s --check-prefixes=ATEXIT,ATEXIT_DARWIN
 
 // RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -emit-llvm -std=c++11 %s -o - \
 // RUN:  | FileCheck %s --check-prefix=CXAATEXIT
 
 // RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -emit-llvm -std=c++11 %s -o - \
-// RUN:    -fno-use-cxa-atexit | FileCheck %s --check-prefixes=ATEXIT,ELF
+// RUN:    -fno-use-cxa-atexit | FileCheck %s --check-prefixes=ATEXIT,ATEXIT_ELF
+
+// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -emit-llvm -std=c++11 %s \
+// RUN:  -fptrauth-function-pointer-type-discrimination  -o - | FileCheck %s --check-prefix=CXAATEXIT_DISC
+
+// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -emit-llvm -std=c++11 %s -o - \
+// RUN:   -fptrauth-function-pointer-type-discrimination  -fno-use-cxa-atexit \
+// RUN:  | FileCheck %s --check-prefixes=ATEXIT_DISC,ATEXIT_DISC_DARWIN
+
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -emit-llvm -std=c++11 %s \
+// RUN:  -fptrauth-function-pointer-type-discrimination  -o - | FileCheck %s --check-prefix=CXAATEXIT_DISC
+
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -emit-llvm -std=c++11 %s -o - \
+// RUN:   -fptrauth-function-pointer-type-discrimination -fno-use-cxa-atexit \
+// RUN:  | FileCheck %s --check-prefixes=ATEXIT_DISC,ATEXIT_DISC_ELF
 
 class Foo {
  public:
@@ -21,11 +35,22 @@ Foo global;
 // CXAATEXIT: define internal void @__cxx_global_var_init()
 // CXAATEXIT:   call i32 @__cxa_atexit(ptr ptrauth (ptr @_ZN3FooD1Ev, i32 0), ptr @global, ptr @__dso_handle)
 
+// CXAATEXIT_DISC: define internal void @__cxx_global_var_init()
+// CXAATEXIT_DISC:   call i32 @__cxa_atexit(ptr ptrauth (ptr @_ZN3FooD1Ev, i32 0, i64 10942), ptr @global, ptr @__dso_handle)
 
 // ATEXIT: define internal void @__cxx_global_var_init()
 // ATEXIT:   %{{.*}} = call i32 @atexit(ptr ptrauth (ptr @__dtor_global, i32 0))
 
-// DARWIN: define internal void @__dtor_global() {{.*}} section "__TEXT,__StaticInit,regular,pure_instructions" {
-// ELF:    define internal void @__dtor_global() {{.*}} section ".text.startup" {
-// DARWIN:   %{{.*}} = call ptr @_ZN3FooD1Ev(ptr @global)
-// ELF:      call void @_ZN3FooD1Ev(ptr @global)
+// ATEXIT_DARWIN: define internal void @__dtor_global() {{.*}} section "__TEXT,__StaticInit,regular,pure_instructions" {
+// ATEXIT_ELF:    define internal void @__dtor_global() {{.*}} section ".text.startup" {
+// ATEXIT_DARWIN:   %{{.*}} = call ptr @_ZN3FooD1Ev(ptr @global)
+// ATEXIT_ELF:      call void @_ZN3FooD1Ev(ptr @global)
+
+// ATEXIT_DISC: define internal void @__cxx_global_var_init()
+// ATEXIT_DISC:   %{{.*}} = call i32 @atexit(ptr ptrauth (ptr @__dtor_global, i32 0, i64 10942))
+
+
+// ATEXIT_DISC_DARWIN: define internal void @__dtor_global() {{.*}} section "__TEXT,__StaticInit,regular,pure_instructions" {
+// ATEXIT_DISC_ELF:    define internal void @__dtor_global() {{.*}} section ".text.startup" {
+// ATEXIT_DISC_DARWIN:   %{{.*}} = call ptr @_ZN3FooD1Ev(ptr @global)
+// ATEXIT_DISC_ELF:      call void @_ZN3FooD1Ev(ptr @global)
diff --git a/clang/test/Driver/linker-wrapper-passes.c b/clang/test/Driver/linker-wrapper-passes.c
index aadcf47..fb63ef7 100644
--- a/clang/test/Driver/linker-wrapper-passes.c
+++ b/clang/test/Driver/linker-wrapper-passes.c
@@ -4,6 +4,9 @@
 // REQUIRES: x86-registered-target
 // REQUIRES: amdgpu-registered-target
 
+// https://github.com/llvm/llvm-project/issues/100212
+// XFAIL: *
+
 // Setup.
 // RUN: mkdir -p %t
 // RUN: %clang -cc1 -emit-llvm-bc -o %t/host-x86_64-unknown-linux-gnu.bc \
@@ -13,7 +16,7 @@
 // RUN: opt %t/openmp-amdgcn-amd-amdhsa.bc -o %t/openmp-amdgcn-amd-amdhsa.bc \
 // RUN:     -passes=forceattrs -force-remove-attribute=f:noinline
 // RUN: clang-offload-packager -o %t/openmp-x86_64-unknown-linux-gnu.out \
-// RUN:     --image=file=%t/openmp-amdgcn-amd-amdhsa.bc,triple=amdgcn-amd-amdhsa
+// RUN:     --image=file=%t/openmp-amdgcn-amd-amdhsa.bc,arch=gfx90a,triple=amdgcn-amd-amdhsa
 // RUN: %clang -cc1 -S -o %t/host-x86_64-unknown-linux-gnu.s \
 // RUN:     -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa \
 // RUN:     -fembed-offload-object=%t/openmp-x86_64-unknown-linux-gnu.out \
diff --git a/clang/test/Driver/modulemap-allow-subdirectory-search.c b/clang/test/Driver/modulemap-allow-subdirectory-search.c
new file mode 100644
index 0000000..ee993a7
--- /dev/null
+++ b/clang/test/Driver/modulemap-allow-subdirectory-search.c
@@ -0,0 +1,27 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+
+// Check that with a sufficiently new SDK we do not search for module maps in subdirectories.
+
+// New SDK.
+// RUN: %clang -target x86_64-apple-macos10.13 -isysroot %t/MacOSX15.0.sdk -fmodules %t/test.c -### 2>&1 \
+// RUN:   | FileCheck --check-prefix=NO-SUBDIRECTORIES %t/test.c
+// Old SDK.
+// RUN: %clang -target x86_64-apple-macos10.13 -isysroot %t/MacOSX14.0.sdk -fmodules %t/test.c -### 2>&1 \
+// RUN:   | FileCheck --check-prefix=SEARCH-SUBDIRECTORIES %t/test.c
+// Non-Darwin platform.
+// RUN: %clang -target i386-unknown-linux -isysroot %t/MacOSX15.0.sdk -fmodules %t/test.c -### 2>&1 \
+// RUN:   | FileCheck --check-prefix=SEARCH-SUBDIRECTORIES %t/test.c
+// New SDK overriding the default.
+// RUN: %clang -target x86_64-apple-macos10.13 -isysroot %t/MacOSX15.0.sdk -fmodules %t/test.c -fmodulemap-allow-subdirectory-search -### 2>&1 \
+// RUN:   | FileCheck --check-prefix=SEARCH-SUBDIRECTORIES %t/test.c
+
+//--- test.c
+// NO-SUBDIRECTORIES: "-fno-modulemap-allow-subdirectory-search"
+// SEARCH-SUBDIRECTORIES-NOT: "-fno-modulemap-allow-subdirectory-search"
+
+//--- MacOSX15.0.sdk/SDKSettings.json
+{"Version":"15.0", "MaximumDeploymentTarget": "15.0.99"}
+
+//--- MacOSX14.0.sdk/SDKSettings.json
+{"Version":"14.0", "MaximumDeploymentTarget": "14.0.99"}
diff --git a/clang/test/Driver/nvlink-wrapper.c b/clang/test/Driver/nvlink-wrapper.c
index fdda93f..318315d 100644
--- a/clang/test/Driver/nvlink-wrapper.c
+++ b/clang/test/Driver/nvlink-wrapper.c
@@ -63,3 +63,10 @@ int baz() { return y + x; }
 // RUN:   -arch sm_52 -o a.out 2>&1 | FileCheck %s --check-prefix=LTO
 // LTO: ptxas{{.*}} -m64 -c [[PTX:.+]].s -O3 -arch sm_52 -o [[CUBIN:.+]].cubin
 // LTO: nvlink{{.*}} -arch sm_52 -o a.out [[CUBIN]].cubin {{.*}}-u-{{.*}}.cubin {{.*}}-y-{{.*}}.cubin
+
+//
+// Check that we don't forward some arguments (here: the --cuda-path= option).
+//
+// RUN: clang-nvlink-wrapper --dry-run %t.o %t-u.o %t-y.a \
+// RUN:   -arch sm_52 --cuda-path=/opt/cuda -o a.out 2>&1 | FileCheck %s --check-prefix=PATH
+// PATH-NOT: --cuda-path=/opt/cuda
diff --git a/clang/test/Headers/stddefneeds.cpp b/clang/test/Headers/stddefneeds.cpp
index 0763bbd..0282e8a 100644
--- a/clang/test/Headers/stddefneeds.cpp
+++ b/clang/test/Headers/stddefneeds.cpp
@@ -56,14 +56,21 @@ max_align_t m5;
 #undef NULL
 #define NULL 0
 
-// glibc (and other) headers then define __need_NULL and rely on stddef.h
-// to redefine NULL to the correct value again.
-#define __need_NULL
+// Including stddef.h again shouldn't redefine NULL
 #include <stddef.h>
 
 // gtk headers then use __attribute__((sentinel)), which doesn't work if NULL
 // is 0.
-void f(const char* c, ...) __attribute__((sentinel));
+void f(const char* c, ...) __attribute__((sentinel)); // expected-note{{function has been explicitly marked sentinel here}}
 void g() {
+  f("", NULL); // expected-warning{{missing sentinel in function call}}
+}
+
+// glibc (and other) headers then define __need_NULL and rely on stddef.h
+// to redefine NULL to the correct value again.
+#define __need_NULL
+#include <stddef.h>
+
+void h() {
   f("", NULL);  // Shouldn't warn.
 }
diff --git a/clang/test/Modules/modulemap-allow-subdirectory-search.m b/clang/test/Modules/modulemap-allow-subdirectory-search.m
new file mode 100644
index 0000000..ef6f9b1
--- /dev/null
+++ b/clang/test/Modules/modulemap-allow-subdirectory-search.m
@@ -0,0 +1,18 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+
+// RUN: %clang_cc1 -fsyntax-only -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/modules.cache -I %t/include %t/test.m
+// RUN: %clang_cc1 -fsyntax-only -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/modules.cache -I %t/include %t/test.m -fmodulemap-allow-subdirectory-search
+// RUN: not %clang_cc1 -fsyntax-only -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/modules.cache -I %t/include %t/test.m -fno-modulemap-allow-subdirectory-search
+
+//--- include/UnrelatedName/Header.h
+// empty
+
+//--- include/UnrelatedName/module.modulemap
+module UsefulCode {
+  header "Header.h"
+  export *
+}
+
+//--- test.m
+@import UsefulCode;
diff --git a/clang/test/Modules/stddef.cpp b/clang/test/Modules/stddef.cpp
new file mode 100644
index 0000000..c53bfa3
--- /dev/null
+++ b/clang/test/Modules/stddef.cpp
@@ -0,0 +1,73 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/no-lsv -I%t %t/stddef.cpp -verify
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-local-submodule-visibility -fmodules-cache-path=%t/lsv -I%t %t/stddef.cpp -verify
+
+//--- stddef.cpp
+#include <b.h>
+
+void *pointer = NULL;
+size_t size = 0;
+
+// When building with modules, a pcm is never re-imported, so re-including
+// stddef.h will not re-import _Builtin_stddef.null to restore the definition of
+// NULL, even though stddef.h will unconditionally include __stddef_null.h when
+// building with modules.
+#undef NULL
+#include <stddef.h>
+
+void *anotherPointer = NULL; // expected-error{{use of undeclared identifier 'NULL'}}
+
+// stddef.h needs to be a `textual` header to support clients doing things like
+// this.
+//
+// #define __need_NULL
+// #include <stddef.h>
+//
+// As a textual header designed to be included multiple times, it can't directly
+// declare anything, or those declarations would go into every module that
+// included it. e.g. if stddef.h contained all of its declarations, and modules
+// A and B included stddef.h, they would both have the declaration for size_t.
+// That breaks Swift, which uses the module name as part of the type name, i.e.
+// A.size_t and B.size_t are treated as completely different types in Swift and
+// cannot be interchanged. To fix that, stddef.h (and stdarg.h) are split out
+// into a separate file per __need macro that can be normal headers in explicit
+// submodules. That runs into yet another wrinkle though. When modules build,
+// declarations from previous submodules leak into subsequent ones when not
+// using local submodule visibility. Consider if stddef.h did the normal thing.
+//
+// #ifndef __STDDEF_H
+// #define __STDDEF_H
+// // include all of the sub-headers
+// #endif
+//
+// When SM builds without local submodule visibility, it will precompile a.h
+// first. When it gets to b.h, the __STDDEF_H declaration from precompiling a.h
+// will leak, and so when b.h includes stddef.h, it won't include any of its
+// sub-headers, and SM.B will thus not import _Builtin_stddef or make any of its
+// submodules visible. Precompiling b.h will be fine since it sees all of the
+// declarations from a.h including stddef.h, but clients that only include b.h
+// will not see any of the stddef.h types. stddef.h thus has to make sure to
+// always include the necessary sub-headers, even if they've been included
+// already. They all have their own header guards to allow this.
+// __stddef_null.h is extra special, so this test makes sure to cover NULL plus
+// one of the normal stddef.h types.
+
+//--- module.modulemap
+module SM {
+  module A {
+    header "a.h"
+    export *
+  }
+
+  module B {
+    header "b.h"
+    export *
+  }
+}
+
+//--- a.h
+#include <stddef.h>
+
+//--- b.h
+#include <stddef.h>
diff --git a/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td b/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
index e84b530..8c80a51 100644
--- a/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
+++ b/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
@@ -12,9 +12,9 @@ def verbose : Flag<["-"], "v">, HelpText<"Print verbose information">;
 def version : Flag<["--"], "version">,
   HelpText<"Display the version number and exit">;
 
-def cuda_path_EQ : Joined<["--"], "cuda-path=">,
+def cuda_path_EQ : Joined<["--"], "cuda-path=">, Flags<[WrapperOnlyOption]>,
   MetaVarName<"<dir>">, HelpText<"Set the system CUDA path">;
-def ptxas_path_EQ : Joined<["--"], "ptxas-path=">,
+def ptxas_path_EQ : Joined<["--"], "ptxas-path=">, Flags<[WrapperOnlyOption]>,
   MetaVarName<"<dir>">, HelpText<"Set the 'ptxas' path">;
 
 def o : JoinedOrSeparate<["-"], "o">, MetaVarName<"<path>">,
diff --git a/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp b/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
index 513e184..bdb5e23 100644
--- a/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
+++ b/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
@@ -653,12 +653,28 @@ TEST(MinimizeSourceToDependencyDirectivesTest, AtImport) {
 TEST(MinimizeSourceToDependencyDirectivesTest, EmptyIncludesAndImports) {
   SmallVector<char, 128> Out;
 
-  ASSERT_TRUE(minimizeSourceToDependencyDirectives("#import\n", Out));
-  ASSERT_TRUE(minimizeSourceToDependencyDirectives("#include\n", Out));
-  ASSERT_TRUE(minimizeSourceToDependencyDirectives("#ifdef A\n"
-                                                   "#import \n"
-                                                   "#endif\n",
-                                                   Out));
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#import\n", Out));
+  EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#include\n", Out));
+  EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#ifdef A\n"
+                                                    "#import \n"
+                                                    "#endif\n",
+                                                    Out));
+  // The ifdef block is removed because it's "empty".
+  EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#ifdef A\n"
+                                                    "#import \n"
+                                                    "#define B\n"
+                                                    "#endif\n",
+                                                    Out));
+  EXPECT_STREQ("#ifdef A\n"
+               "#define B\n"
+               "#endif\n",
+               Out.data());
 }
 
 TEST(MinimizeSourceToDependencyDirectivesTest, AtImportFailures) {
diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h
index 9a8e53b..d850574 100644
--- a/compiler-rt/lib/scudo/standalone/secondary.h
+++ b/compiler-rt/lib/scudo/standalone/secondary.h
@@ -19,6 +19,7 @@
 #include "stats.h"
 #include "string_utils.h"
 #include "thread_annotations.h"
+#include "vector.h"
 
 namespace scudo {
 
@@ -73,12 +74,18 @@ static inline void unmap(LargeBlock::Header *H) {
 }
 
 namespace {
+
 struct CachedBlock {
+  static constexpr u16 CacheIndexMax = UINT16_MAX;
+  static constexpr u16 InvalidEntry = CacheIndexMax;
+
   uptr CommitBase = 0;
   uptr CommitSize = 0;
   uptr BlockBegin = 0;
   MemMapT MemMap = {};
   u64 Time = 0;
+  u16 Next = 0;
+  u16 Prev = 0;
 
   bool isValid() { return CommitBase != 0; }
 
@@ -188,10 +195,11 @@ public:
     Str->append("Stats: CacheRetrievalStats: SuccessRate: %u/%u "
                 "(%zu.%02zu%%)\n",
                 SuccessfulRetrieves, CallsToRetrieve, Integral, Fractional);
-    for (CachedBlock Entry : Entries) {
-      if (!Entry.isValid())
-        continue;
-      Str->append("StartBlockAddress: 0x%zx, EndBlockAddress: 0x%zx, "
+    Str->append("Cache Entry Info (Most Recent -> Least Recent):\n");
+
+    for (u32 I = LRUHead; I != CachedBlock::InvalidEntry; I = Entries[I].Next) {
+      CachedBlock &Entry = Entries[I];
+      Str->append("  StartBlockAddress: 0x%zx, EndBlockAddress: 0x%zx, "
                   "BlockSize: %zu %s\n",
                   Entry.CommitBase, Entry.CommitBase + Entry.CommitSize,
                   Entry.CommitSize, Entry.Time == 0 ? "[R]" : "");
@@ -202,6 +210,10 @@ public:
   static_assert(Config::getDefaultMaxEntriesCount() <=
                     Config::getEntriesArraySize(),
                 "");
+  // Ensure the cache entry array size fits in the LRU list Next and Prev
+  // index fields
+  static_assert(Config::getEntriesArraySize() <= CachedBlock::CacheIndexMax,
+                "Cache entry array is too large to be indexed.");
 
   void init(s32 ReleaseToOsInterval) NO_THREAD_SAFETY_ANALYSIS {
     DCHECK_EQ(EntriesCount, 0U);
@@ -213,23 +225,33 @@ public:
     if (Config::getDefaultReleaseToOsIntervalMs() != INT32_MIN)
       ReleaseToOsInterval = Config::getDefaultReleaseToOsIntervalMs();
     setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval));
+
+    // The cache is initially empty
+    LRUHead = CachedBlock::InvalidEntry;
+    LRUTail = CachedBlock::InvalidEntry;
+
+    // Available entries will be retrieved starting from the beginning of the
+    // Entries array
+    AvailableHead = 0;
+    for (u32 I = 0; I < Config::getEntriesArraySize() - 1; I++)
+      Entries[I].Next = static_cast<u16>(I + 1);
+
+    Entries[Config::getEntriesArraySize() - 1].Next = CachedBlock::InvalidEntry;
   }
 
   void store(const Options &Options, LargeBlock::Header *H) EXCLUDES(Mutex) {
     if (!canCache(H->CommitSize))
       return unmap(H);
 
-    bool EntryCached = false;
-    bool EmptyCache = false;
     const s32 Interval = atomic_load_relaxed(&ReleaseToOsIntervalMs);
-    const u64 Time = getMonotonicTimeFast();
-    const u32 MaxCount = atomic_load_relaxed(&MaxEntriesCount);
+    u64 Time;
     CachedBlock Entry;
+
     Entry.CommitBase = H->CommitBase;
     Entry.CommitSize = H->CommitSize;
     Entry.BlockBegin = reinterpret_cast<uptr>(H + 1);
     Entry.MemMap = H->MemMap;
-    Entry.Time = Time;
+    Entry.Time = UINT64_MAX;
     if (useMemoryTagging<Config>(Options)) {
       if (Interval == 0 && !SCUDO_FUCHSIA) {
         // Release the memory and make it inaccessible at the same time by
@@ -243,17 +265,32 @@ public:
         Entry.MemMap.setMemoryPermission(Entry.CommitBase, Entry.CommitSize,
                                          MAP_NOACCESS);
       }
-    } else if (Interval == 0) {
-      Entry.MemMap.releaseAndZeroPagesToOS(Entry.CommitBase, Entry.CommitSize);
-      Entry.Time = 0;
     }
+
+    // Usually only one entry will be evicted from the cache.
+    // Only in the rare event that the cache shrinks in real-time
+    // due to a decrease in the configurable value MaxEntriesCount
+    // will more than one cache entry be evicted.
+    // The vector is used to save the MemMaps of evicted entries so
+    // that the unmap call can be performed outside the lock
+    Vector<MemMapT, 1U> EvictionMemMaps;
+
     do {
       ScopedLock L(Mutex);
+
+      // Time must be computed under the lock to ensure
+      // that the LRU cache remains sorted with respect to
+      // time in a multithreaded environment
+      Time = getMonotonicTimeFast();
+      if (Entry.Time != 0)
+        Entry.Time = Time;
+
       if (useMemoryTagging<Config>(Options) && QuarantinePos == -1U) {
         // If we get here then memory tagging was disabled in between when we
         // read Options and when we locked Mutex. We can't insert our entry into
         // the quarantine or the cache because the permissions would be wrong so
         // just unmap it.
+        Entry.MemMap.unmap(Entry.MemMap.getBase(), Entry.MemMap.getCapacity());
         break;
       }
       if (Config::getQuarantineSize() && useMemoryTagging<Config>(Options)) {
@@ -269,36 +306,32 @@ public:
           OldestTime = Entry.Time;
         Entry = PrevEntry;
       }
-      if (EntriesCount >= MaxCount) {
-        if (IsFullEvents++ == 4U)
-          EmptyCache = true;
-      } else {
-        for (u32 I = 0; I < MaxCount; I++) {
-          if (Entries[I].isValid())
-            continue;
-          if (I != 0)
-            Entries[I] = Entries[0];
-          Entries[0] = Entry;
-          EntriesCount++;
-          if (OldestTime == 0)
-            OldestTime = Entry.Time;
-          EntryCached = true;
-          break;
-        }
+
+      // All excess entries are evicted from the cache
+      while (needToEvict()) {
+        // Save MemMaps of evicted entries to perform unmap outside of lock
+        EvictionMemMaps.push_back(Entries[LRUTail].MemMap);
+        remove(LRUTail);
       }
+
+      insert(Entry);
+
+      if (OldestTime == 0)
+        OldestTime = Entry.Time;
     } while (0);
-    if (EmptyCache)
-      empty();
-    else if (Interval >= 0)
+
+    for (MemMapT &EvictMemMap : EvictionMemMaps)
+      EvictMemMap.unmap(EvictMemMap.getBase(), EvictMemMap.getCapacity());
+
+    if (Interval >= 0) {
+      // TODO: Add ReleaseToOS logic to LRU algorithm
       releaseOlderThan(Time - static_cast<u64>(Interval) * 1000000);
-    if (!EntryCached)
-      Entry.MemMap.unmap(Entry.MemMap.getBase(), Entry.MemMap.getCapacity());
+    }
   }
 
   bool retrieve(Options Options, uptr Size, uptr Alignment, uptr HeadersSize,
                 LargeBlock::Header **H, bool *Zeroed) EXCLUDES(Mutex) {
     const uptr PageSize = getPageSizeCached();
-    const u32 MaxCount = atomic_load_relaxed(&MaxEntriesCount);
     // 10% of the requested size proved to be the optimal choice for
     // retrieving cached blocks after testing several options.
     constexpr u32 FragmentedBytesDivisor = 10;
@@ -312,9 +345,8 @@ public:
         return false;
       u32 OptimalFitIndex = 0;
       uptr MinDiff = UINTPTR_MAX;
-      for (u32 I = 0; I < MaxCount; I++) {
-        if (!Entries[I].isValid())
-          continue;
+      for (u32 I = LRUHead; I != CachedBlock::InvalidEntry;
+           I = Entries[I].Next) {
         const uptr CommitBase = Entries[I].CommitBase;
         const uptr CommitSize = Entries[I].CommitSize;
         const uptr AllocPos =
@@ -347,8 +379,7 @@ public:
       }
       if (Found) {
         Entry = Entries[OptimalFitIndex];
-        Entries[OptimalFitIndex].invalidate();
-        EntriesCount--;
+        remove(OptimalFitIndex);
         SuccessfulRetrieves++;
       }
     }
@@ -417,12 +448,9 @@ public:
         Quarantine[I].invalidate();
       }
     }
-    const u32 MaxCount = atomic_load_relaxed(&MaxEntriesCount);
-    for (u32 I = 0; I < MaxCount; I++) {
-      if (Entries[I].isValid()) {
-        Entries[I].MemMap.setMemoryPermission(Entries[I].CommitBase,
-                                              Entries[I].CommitSize, 0);
-      }
+    for (u32 I = LRUHead; I != CachedBlock::InvalidEntry; I = Entries[I].Next) {
+      Entries[I].MemMap.setMemoryPermission(Entries[I].CommitBase,
+                                            Entries[I].CommitSize, 0);
     }
     QuarantinePos = -1U;
   }
@@ -434,6 +462,66 @@ public:
   void unmapTestOnly() { empty(); }
 
 private:
+  bool needToEvict() REQUIRES(Mutex) {
+    return (EntriesCount >= atomic_load_relaxed(&MaxEntriesCount));
+  }
+
+  void insert(const CachedBlock &Entry) REQUIRES(Mutex) {
+    DCHECK_LT(EntriesCount, atomic_load_relaxed(&MaxEntriesCount));
+
+    // The availability stack must still hold a free entry to insert into
+    DCHECK_NE(AvailableHead, CachedBlock::InvalidEntry);
+
+    u32 FreeIndex = AvailableHead;
+    AvailableHead = Entries[AvailableHead].Next;
+
+    if (EntriesCount == 0) {
+      LRUTail = static_cast<u16>(FreeIndex);
+    } else {
+      // Check list order
+      if (EntriesCount > 1)
+        DCHECK_GE(Entries[LRUHead].Time, Entries[Entries[LRUHead].Next].Time);
+      Entries[LRUHead].Prev = static_cast<u16>(FreeIndex);
+    }
+
+    Entries[FreeIndex] = Entry;
+    Entries[FreeIndex].Next = LRUHead;
+    Entries[FreeIndex].Prev = CachedBlock::InvalidEntry;
+    LRUHead = static_cast<u16>(FreeIndex);
+    EntriesCount++;
+
+    // Availability stack should not have available entries when all entries
+    // are in use
+    if (EntriesCount == Config::getEntriesArraySize())
+      DCHECK_EQ(AvailableHead, CachedBlock::InvalidEntry);
+  }
+
+  void remove(uptr I) REQUIRES(Mutex) {
+    DCHECK(Entries[I].isValid());
+
+    Entries[I].invalidate();
+
+    if (I == LRUHead)
+      LRUHead = Entries[I].Next;
+    else
+      Entries[Entries[I].Prev].Next = Entries[I].Next;
+
+    if (I == LRUTail)
+      LRUTail = Entries[I].Prev;
+    else
+      Entries[Entries[I].Next].Prev = Entries[I].Prev;
+
+    Entries[I].Next = AvailableHead;
+    AvailableHead = static_cast<u16>(I);
+    EntriesCount--;
+
+    // Cache should not have valid entries when empty
+    if (EntriesCount == 0) {
+      DCHECK_EQ(LRUHead, CachedBlock::InvalidEntry);
+      DCHECK_EQ(LRUTail, CachedBlock::InvalidEntry);
+    }
+  }
+
   void empty() {
     MemMapT MapInfo[Config::getEntriesArraySize()];
     uptr N = 0;
@@ -443,11 +531,10 @@ private:
         if (!Entries[I].isValid())
           continue;
         MapInfo[N] = Entries[I].MemMap;
-        Entries[I].invalidate();
+        remove(I);
         N++;
       }
       EntriesCount = 0;
-      IsFullEvents = 0;
     }
     for (uptr I = 0; I < N; I++) {
       MemMapT &MemMap = MapInfo[I];
@@ -484,7 +571,6 @@ private:
   atomic_u32 MaxEntriesCount = {};
   atomic_uptr MaxEntrySize = {};
   u64 OldestTime GUARDED_BY(Mutex) = 0;
-  u32 IsFullEvents GUARDED_BY(Mutex) = 0;
   atomic_s32 ReleaseToOsIntervalMs = {};
   u32 CallsToRetrieve GUARDED_BY(Mutex) = 0;
   u32 SuccessfulRetrieves GUARDED_BY(Mutex) = 0;
@@ -492,6 +578,13 @@ private:
   CachedBlock Entries[Config::getEntriesArraySize()] GUARDED_BY(Mutex) = {};
   NonZeroLengthArray<CachedBlock, Config::getQuarantineSize()>
       Quarantine GUARDED_BY(Mutex) = {};
+
+  // The LRUHead of the cache is the most recently used cache entry
+  u16 LRUHead GUARDED_BY(Mutex) = 0;
+  // The LRUTail of the cache is the least recently used cache entry
+  u16 LRUTail GUARDED_BY(Mutex) = 0;
+  // The AvailableHead is the top of the stack of available entries
+  u16 AvailableHead GUARDED_BY(Mutex) = 0;
 };
 
 template <typename Config> class MapAllocator {
diff --git a/libc/cmake/modules/LibcConfig.cmake b/libc/cmake/modules/LibcConfig.cmake
index 7a3e606..da166dd 100644
--- a/libc/cmake/modules/LibcConfig.cmake
+++ b/libc/cmake/modules/LibcConfig.cmake
@@ -113,7 +113,7 @@ function(load_libc_config config_file)
       message(FATAL_ERROR ${json_error})
     endif()
     if(NOT DEFINED ${opt_name})
-      message(FATAL_ERROR: " Option ${opt_name} defined in ${config_file} is invalid.")
+      message(FATAL_ERROR " Option ${opt_name} defined in ${config_file} is invalid.")
     endif()
     if(ARGN)
       list(FIND ARGN ${opt_name} optname_exists)
diff --git a/libc/config/config.json b/libc/config/config.json
index 2005f42..2bf432e 100644
--- a/libc/config/config.json
+++ b/libc/config/config.json
@@ -1,8 +1,8 @@
 {
   "errno": {
     "LIBC_CONF_ERRNO_MODE": {
-      "value": "",
-      "doc": "The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, and LIBC_ERRNO_MODE_SYSTEM."
+      "value": "LIBC_ERRNO_MODE_DEFAULT",
+      "doc": "The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, and LIBC_ERRNO_MODE_SYSTEM."
     }
   },
   "printf": {
diff --git a/libc/src/errno/libc_errno.cpp b/libc/src/errno/libc_errno.cpp
index 7a17a5a..d1600d1 100644
--- a/libc/src/errno/libc_errno.cpp
+++ b/libc/src/errno/libc_errno.cpp
@@ -9,6 +9,8 @@
 #include "libc_errno.h"
 #include "src/__support/macros/config.h"
 
+// libc uses a fallback default value, either system or thread local.
+#define LIBC_ERRNO_MODE_DEFAULT 0
 // libc never stores a value; `errno` macro uses get link-time failure.
 #define LIBC_ERRNO_MODE_UNDEFINED 1
 // libc maintains per-thread state (requires C++ `thread_local` support).
@@ -23,7 +25,8 @@
 // fullbuild mode, effectively the same as `LIBC_ERRNO_MODE_EXTERNAL`.
 #define LIBC_ERRNO_MODE_SYSTEM 5
 
-#ifndef LIBC_ERRNO_MODE
+#if !defined(LIBC_ERRNO_MODE) || LIBC_ERRNO_MODE == LIBC_ERRNO_MODE_DEFAULT
+#undef LIBC_ERRNO_MODE
 #if defined(LIBC_FULL_BUILD) || !defined(LIBC_COPT_PUBLIC_PACKAGING)
 #define LIBC_ERRNO_MODE LIBC_ERRNO_MODE_THREAD_LOCAL
 #else
@@ -31,12 +34,14 @@
 #endif
 #endif // LIBC_ERRNO_MODE
 
-#if LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_UNDEFINED &&                            \
+#if LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_DEFAULT &&                              \
+    LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_UNDEFINED &&                            \
     LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_THREAD_LOCAL &&                         \
     LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SHARED &&                               \
     LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_EXTERNAL &&                             \
     LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM
 #error LIBC_ERRNO_MODE must be one of the following values: \
+LIBC_ERRNO_MODE_DEFAULT, \
 LIBC_ERRNO_MODE_UNDEFINED, \
 LIBC_ERRNO_MODE_THREAD_LOCAL, \
 LIBC_ERRNO_MODE_SHARED, \
diff --git a/libc/src/setjmp/riscv/longjmp.cpp b/libc/src/setjmp/riscv/longjmp.cpp
index b14f636..0f9537c 100644
--- a/libc/src/setjmp/riscv/longjmp.cpp
+++ b/libc/src/setjmp/riscv/longjmp.cpp
@@ -30,7 +30,6 @@
 
 namespace LIBC_NAMESPACE_DECL {
 
-[[gnu::naked]]
 LLVM_LIBC_FUNCTION(void, longjmp, (__jmp_buf * buf, int val)) {
   LOAD(ra, buf->__pc);
   LOAD(s0, buf->__regs[0]);
diff --git a/libc/src/setjmp/riscv/setjmp.cpp b/libc/src/setjmp/riscv/setjmp.cpp
index 92982cc..12def57 100644
--- a/libc/src/setjmp/riscv/setjmp.cpp
+++ b/libc/src/setjmp/riscv/setjmp.cpp
@@ -29,7 +29,6 @@
 
 namespace LIBC_NAMESPACE_DECL {
 
-[[gnu::naked]]
 LLVM_LIBC_FUNCTION(int, setjmp, (__jmp_buf * buf)) {
   STORE(ra, buf->__pc);
   STORE(s0, buf->__regs[0]);
diff --git a/libcxx/include/string_view b/libcxx/include/string_view
index 72dbf0b..2a03ee9 100644
--- a/libcxx/include/string_view
+++ b/libcxx/include/string_view
@@ -448,8 +448,11 @@ public:
   }
 
   _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI basic_string_view substr(size_type __pos = 0, size_type __n = npos) const {
+    // Use the `__assume_valid` form of the constructor to avoid an unnecessary check. Any substring of a view is a
+    // valid view. In particular, `size()` is known to be smaller than `numeric_limits<difference_type>::max()`, so the
+    // new size is also smaller. See also https://github.com/llvm/llvm-project/issues/91634.
     return __pos > size() ? (__throw_out_of_range("string_view::substr"), basic_string_view())
-                          : basic_string_view(data() + __pos, std::min(__n, size() - __pos));
+                          : basic_string_view(__assume_valid(), data() + __pos, std::min(__n, size() - __pos));
   }
 
   _LIBCPP_CONSTEXPR_SINCE_CXX14 int compare(basic_string_view __sv) const _NOEXCEPT {
@@ -674,6 +677,16 @@ public:
 #endif
 
 private:
+  struct __assume_valid {};
+
+  // This is the same as the pointer and length constructor, but without the additional hardening checks. It is intended
+  // for use within the class, when the class invariants already guarantee the resulting object is valid. The compiler
+  // usually cannot eliminate the redundant checks because it does not know class invariants.
+  _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI
+  basic_string_view(__assume_valid, const _CharT* __s, size_type __len) _NOEXCEPT
+      : __data_(__s),
+        __size_(__len) {}
+
   const value_type* __data_;
   size_type __size_;
 };
diff --git a/libcxx/test/libcxx/fuzzing/random.pass.cpp b/libcxx/test/libcxx/fuzzing/random.pass.cpp
index 6639776..af80fb8 100644
--- a/libcxx/test/libcxx/fuzzing/random.pass.cpp
+++ b/libcxx/test/libcxx/fuzzing/random.pass.cpp
@@ -8,7 +8,7 @@
 
 // This test fails because Clang no longer enables -fdelayed-template-parsing
 // by default on Windows with C++20 (#69431).
-// XFAIL: msvc && (clang-18 || clang-19)
+// XFAIL: msvc && (clang-18 || clang-19 || clang-20)
 
 // UNSUPPORTED: c++03, c++11
 
diff --git a/libcxx/test/std/depr/depr.c.headers/math_h.pass.cpp b/libcxx/test/std/depr/depr.c.headers/math_h.pass.cpp
index bbfb0c554..0f47a51 100644
--- a/libcxx/test/std/depr/depr.c.headers/math_h.pass.cpp
+++ b/libcxx/test/std/depr/depr.c.headers/math_h.pass.cpp
@@ -8,7 +8,7 @@
 
 // This test fails because Clang no longer enables -fdelayed-template-parsing
 // by default on Windows with C++20 (#69431).
-// XFAIL: msvc && (clang-18 || clang-19)
+// XFAIL: msvc && (clang-18 || clang-19 || clang-20)
 
 // <math.h>
 
diff --git a/libcxx/test/std/numerics/c.math/cmath.pass.cpp b/libcxx/test/std/numerics/c.math/cmath.pass.cpp
index 19b5fd0..6028aa5 100644
--- a/libcxx/test/std/numerics/c.math/cmath.pass.cpp
+++ b/libcxx/test/std/numerics/c.math/cmath.pass.cpp
@@ -8,7 +8,7 @@
 
 // This test fails because Clang no longer enables -fdelayed-template-parsing
 // by default on Windows with C++20 (#69431).
-// XFAIL: msvc && (clang-18 || clang-19)
+// XFAIL: msvc && (clang-18 || clang-19 || clang-20)
 
 // <cmath>
 
diff --git a/libcxx/utils/ci/Dockerfile b/libcxx/utils/ci/Dockerfile
index 9e1865e..490bee4 100644
--- a/libcxx/utils/ci/Dockerfile
+++ b/libcxx/utils/ci/Dockerfile
@@ -106,6 +106,7 @@ RUN sudo apt-get update \
 #RUN apt-get update && apt-get install -y ninja-build python3 python3-distutils python3-psutil git gdb ccache
 # TODO add ninja-build once 1.11 is available in Ubuntu, also remove the manual installation.
 RUN <<EOF
+  set -e
   wget -qO /tmp/ninja.gz https://github.com/ninja-build/ninja/releases/latest/download/ninja-linux.zip
   gunzip /tmp/ninja.gz
   chmod a+x /tmp/ninja
@@ -115,6 +116,7 @@ EOF
 
 # These two locales are not enabled by default so generate them
 RUN <<EOF
+  set -e
   printf "fr_CA ISO-8859-1\ncs_CZ ISO-8859-2" | sudo tee -a /etc/locale.gen
   sudo mkdir /usr/local/share/i1en/
   printf "fr_CA ISO-8859-1\ncs_CZ ISO-8859-2" | sudo tee -a /usr/local/share/i1en/SUPPORTED
@@ -129,6 +131,7 @@ EOF
 # 14 release branch CI uses it. The tip-of-trunk CI will never use Clang 12,
 # though.
 RUN <<EOF
+  set -e
   sudo apt-get update
   wget https://apt.llvm.org/llvm.sh -O /tmp/llvm.sh
   chmod +x /tmp/llvm.sh
@@ -142,6 +145,7 @@ EOF
 
 # Install the most recent GCC, like clang install the previous version as a transition.
 RUN <<EOF
+  set -e
   sudo git clone https://github.com/compiler-explorer/infra.git /tmp/ce-infra
   (cd /tmp/ce-infra && sudo make ce)
   sudo /tmp/ce-infra/bin/ce_install install compilers/c++/x86/gcc $GCC_LATEST_VERSION.1.0
@@ -155,13 +159,14 @@ EOF
 
 RUN <<EOF
     # Install a recent CMake
+    set -e
     wget https://github.com/Kitware/CMake/releases/download/v3.21.1/cmake-3.21.1-linux-x86_64.sh -O /tmp/install-cmake.sh
     sudo bash /tmp/install-cmake.sh --prefix=/usr --exclude-subdir --skip-license
     rm /tmp/install-cmake.sh
 EOF
 
 # ===----------------------------------------------------------------------===##
-#                       Android Buildkite Image
+#                       Android Builder Base Image
 # ===----------------------------------------------------------------------===##
 
 FROM ubuntu:jammy AS android-builder-base
@@ -170,10 +175,11 @@ ARG ANDROID_CLANG_VERSION
 ARG ANDROID_CLANG_PREBUILTS_COMMIT
 ARG ANDROID_SYSROOT_BID
 
-RUN  apt-get update && apt-get install -y curl unzip git
+RUN apt-get update && apt-get install -y curl bzip2 git unzip
 
 # Install the Android platform tools (e.g. adb) into /opt/android/sdk.
 RUN <<EOF
+  set -e
   mkdir -p /opt/android/sdk
   cd /opt/android/sdk
   curl -LO https://dl.google.com/android/repository/platform-tools-latest-linux.zip
@@ -187,6 +193,7 @@ EOF
 ENV ANDROID_CLANG_VERSION=$ANDROID_CLANG_VERSION
 ENV ANDROID_CLANG_PREBUILTS_COMMIT=$ANDROID_CLANG_PREBUILTS_COMMIT
 RUN <<EOF
+    set -e
     git clone --filter=blob:none --sparse \
         https://android.googlesource.com/platform/prebuilts/clang/host/linux-x86 \
         /opt/android/clang
@@ -206,6 +213,7 @@ EOF
 
 ENV ANDROID_SYSROOT_BID=$ANDROID_SYSROOT_BID
 RUN <<EOF
+  set -e
   cd /opt/android
   curl -L -o ndk_platform.tar.bz2 \
       https://androidbuildinternal.googleapis.com/android/internal/build/v3/builds/${ANDROID_SYSROOT_BID}/ndk/attempts/latest/artifacts/ndk_platform.tar.bz2/url
@@ -213,19 +221,6 @@ RUN <<EOF
   rm ndk_platform.tar.bz2
 EOF
 
-# Install Docker
-RUN <<EOF
-  curl -fsSL https://get.docker.com -o /tmp/get-docker.sh
-  sh /tmp/get-docker.sh
-  rm /tmp/get-docker.sh
-
-  # Install Docker. Mark the binary setuid so it can be run without prefixing it
-  # with sudo. Adding the container user to the docker group doesn't work because
-  # /var/run/docker.sock is owned by the host's docker GID, not the container's
-  # docker GID.
-  chmod u+s /usr/bin/docker
-EOF
-
 # ===----------------------------------------------------------------------===##
 #                    Buildkite Builder Image
 # ===----------------------------------------------------------------------===##
@@ -243,6 +238,7 @@ WORKDIR /home/libcxx-builder
 # Install the Buildkite agent and dependencies. This must be done as non-root
 # for the Buildkite agent to be installed in a path where we can find it.
 RUN <<EOF
+  set -e
   cd /home/libcxx-builder
   curl -sL https://raw.githubusercontent.com/buildkite/agent/main/install.sh -o /tmp/install-agent.sh
   bash /tmp/install-agent.sh
@@ -271,6 +267,22 @@ COPY ./vendor/android/container-setup.sh /opt/android/container-setup.sh
 
 ENV PATH="/opt/android/sdk/platform-tools:${PATH}"
 
+USER root
+
+# Install Docker
+RUN <<EOF
+  set -e
+  curl -fsSL https://get.docker.com -o /tmp/get-docker.sh
+  sh /tmp/get-docker.sh
+  rm /tmp/get-docker.sh
+
+  # Install Docker. Mark the binary setuid so it can be run without prefixing it
+  # with sudo. Adding the container user to the docker group doesn't work because
+  # /var/run/docker.sock is owned by the host's docker GID, not the container's
+  # docker GID.
+  chmod u+s /usr/bin/docker
+EOF
+
 USER libcxx-builder
 WORKDIR /home/libcxx-builder
 
diff --git a/libcxx/utils/ci/vendor/android/run-buildbot-container b/libcxx/utils/ci/vendor/android/run-buildbot-container
index 4ab8319..7b5d9a4 100755
--- a/libcxx/utils/ci/vendor/android/run-buildbot-container
+++ b/libcxx/utils/ci/vendor/android/run-buildbot-container
@@ -27,5 +27,5 @@ if [ -S /var/run/docker.sock ]; then
     DOCKER_OPTIONS+=(--volume /var/run/docker.sock:/var/run/docker.sock)
 fi
 
-docker run "${DOCKER_OPTIONS[@]}" libcxx-builder-android \
+docker run "${DOCKER_OPTIONS[@]}" ghcr.io/libcxx/android-buildkite-builder \
     bash -c 'git config --global --add safe.directory /llvm; (/opt/android/container-setup.sh && exec bash)'
diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h
index 0c67206..6c04c92 100644
--- a/llvm/include/llvm/SandboxIR/SandboxIR.h
+++ b/llvm/include/llvm/SandboxIR/SandboxIR.h
@@ -76,6 +76,7 @@ class Context;
 class Function;
 class Instruction;
 class SelectInst;
+class BranchInst;
 class LoadInst;
 class ReturnInst;
 class StoreInst;
@@ -179,6 +180,7 @@ protected:
   friend class User;       // For getting `Val`.
   friend class Use;        // For getting `Val`.
   friend class SelectInst; // For getting `Val`.
+  friend class BranchInst; // For getting `Val`.
   friend class LoadInst;   // For getting `Val`.
   friend class StoreInst;  // For getting `Val`.
   friend class ReturnInst; // For getting `Val`.
@@ -343,6 +345,14 @@ protected:
   virtual unsigned getUseOperandNo(const Use &Use) const = 0;
   friend unsigned Use::getOperandNo() const; // For getUseOperandNo()
 
+  /// Swaps the operand uses at \p OpIdxA and \p OpIdxB through Use::swap().
+  void swapOperandsInternal(unsigned OpIdxA, unsigned OpIdxB) {
+    assert(OpIdxA < getNumOperands() && "OpIdxA out of bounds!");
+    assert(OpIdxB < getNumOperands() && "OpIdxB out of bounds!");
+    auto LHSUse = getOperandUse(OpIdxA), RHSUse = getOperandUse(OpIdxB);
+    LHSUse.swap(RHSUse);
+  }
+
 #ifndef NDEBUG
   void verifyUserOfLLVMUse(const llvm::Use &Use) const;
 #endif // NDEBUG
@@ -504,6 +514,7 @@ protected:
   /// returns its topmost LLVM IR instruction.
   llvm::Instruction *getTopmostLLVMInstruction() const;
   friend class SelectInst; // For getTopmostLLVMInstruction().
+  friend class BranchInst; // For getTopmostLLVMInstruction().
   friend class LoadInst;   // For getTopmostLLVMInstruction().
   friend class StoreInst;  // For getTopmostLLVMInstruction().
   friend class ReturnInst; // For getTopmostLLVMInstruction().
@@ -617,6 +628,100 @@ public:
 #endif
 };
 
+/// Sandbox IR counterpart of llvm::BranchInst (conditional or unconditional).
+class BranchInst : public Instruction {
+  /// Use Context::createBranchInst(). Don't call the constructor directly.
+  BranchInst(llvm::BranchInst *BI, Context &Ctx)
+      : Instruction(ClassID::Br, Opcode::Br, BI, Ctx) {}
+  friend Context; // for BranchInst()
+  Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final {
+    return getOperandUseDefault(OpIdx, Verify);
+  }
+  SmallVector<llvm::Instruction *, 1> getLLVMInstrs() const final {
+    return {cast<llvm::Instruction>(Val)};
+  }
+
+public:
+  unsigned getUseOperandNo(const Use &Use) const final {
+    return getUseOperandNoDefault(Use);
+  }
+  unsigned getNumOfIRInstrs() const final { return 1u; }
+  static BranchInst *create(BasicBlock *IfTrue, Instruction *InsertBefore,
+                            Context &Ctx);
+  static BranchInst *create(BasicBlock *IfTrue, BasicBlock *InsertAtEnd,
+                            Context &Ctx);
+  static BranchInst *create(BasicBlock *IfTrue, BasicBlock *IfFalse,
+                            Value *Cond, Instruction *InsertBefore,
+                            Context &Ctx);
+  static BranchInst *create(BasicBlock *IfTrue, BasicBlock *IfFalse,
+                            Value *Cond, BasicBlock *InsertAtEnd, Context &Ctx);
+  /// For isa/dyn_cast.
+  static bool classof(const Value *From);
+  bool isUnconditional() const {
+    return cast<llvm::BranchInst>(Val)->isUnconditional();
+  }
+  bool isConditional() const {
+    return cast<llvm::BranchInst>(Val)->isConditional();
+  }
+  Value *getCondition() const;
+  void setCondition(Value *V) {
+    assert(isConditional() && "Cannot set condition of an uncond branch!");
+    setOperand(0, V);
+  }
+  unsigned getNumSuccessors() const { return 1 + isConditional(); }
+  BasicBlock *getSuccessor(unsigned SuccIdx) const;
+  void setSuccessor(unsigned Idx, BasicBlock *NewSucc);
+  void swapSuccessors() { swapOperandsInternal(1, 2); }
+
+private:
+  /// Functor mapping an llvm::BasicBlock to its sandbox IR BasicBlock.
+  struct LLVMBBToSBBB {
+    Context &Ctx;
+    LLVMBBToSBBB(Context &Ctx) : Ctx(Ctx) {}
+    BasicBlock *operator()(llvm::BasicBlock *BB) const;
+  };
+
+  struct ConstLLVMBBToSBBB {
+    Context &Ctx;
+    ConstLLVMBBToSBBB(Context &Ctx) : Ctx(Ctx) {}
+    const BasicBlock *operator()(const llvm::BasicBlock *BB) const;
+  };
+
+public:
+  using sb_succ_op_iterator =
+      mapped_iterator<llvm::BranchInst::succ_op_iterator, LLVMBBToSBBB>;
+  /// Returns the successors of this branch as sandbox IR BasicBlocks.
+  iterator_range<sb_succ_op_iterator> successors() {
+    auto LLVMRange = cast<llvm::BranchInst>(Val)->successors();
+    LLVMBBToSBBB BBMap(Ctx);
+    return make_range(map_iterator(LLVMRange.begin(), BBMap),
+                      map_iterator(LLVMRange.end(), BBMap));
+  }
+
+  using const_sb_succ_op_iterator =
+      mapped_iterator<llvm::BranchInst::const_succ_op_iterator,
+                      ConstLLVMBBToSBBB>;
+  iterator_range<const_sb_succ_op_iterator> successors() const {
+    const auto *LLVMBr = cast<llvm::BranchInst>(Val);
+    auto ConstLLVMRange = LLVMBr->successors();
+    ConstLLVMBBToSBBB ConstBBMap(Ctx);
+    return make_range(map_iterator(ConstLLVMRange.begin(), ConstBBMap),
+                      map_iterator(ConstLLVMRange.end(), ConstBBMap));
+  }
+
+#ifndef NDEBUG
+  void verify() const final {
+    assert(isa<llvm::BranchInst>(Val) && "Expected BranchInst!");
+  }
+  friend raw_ostream &operator<<(raw_ostream &OS, const BranchInst &BI) {
+    BI.dump(OS);
+    return OS;
+  }
+  void dump(raw_ostream &OS) const override;
+  LLVM_DUMP_METHOD void dump() const override;
+#endif
+};
+
 class LoadInst final : public Instruction {
   /// Use LoadInst::create() instead of calling the constructor.
   LoadInst(llvm::LoadInst *LI, Context &Ctx)
@@ -870,6 +975,8 @@ protected:
 
   SelectInst *createSelectInst(llvm::SelectInst *SI);
   friend SelectInst; // For createSelectInst()
+  BranchInst *createBranchInst(llvm::BranchInst *I);
+  friend BranchInst; // For createBranchInst()
   LoadInst *createLoadInst(llvm::LoadInst *LI);
   friend LoadInst; // For createLoadInst()
   StoreInst *createStoreInst(llvm::StoreInst *SI);
diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def
index efa9155..f3d6167 100644
--- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def
+++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def
@@ -26,6 +26,7 @@ DEF_USER(Constant, Constant)
 //       ClassID, Opcode(s),  Class
 DEF_INSTR(Opaque, OP(Opaque), OpaqueInst)
 DEF_INSTR(Select, OP(Select), SelectInst)
+DEF_INSTR(Br, OP(Br), BranchInst)
 DEF_INSTR(Load, OP(Load), LoadInst)
 DEF_INSTR(Store, OP(Store), StoreInst)
 DEF_INSTR(Ret, OP(Ret), ReturnInst)
diff --git a/llvm/include/llvm/SandboxIR/Tracker.h b/llvm/include/llvm/SandboxIR/Tracker.h
index b88eb3d..3daec3f 100644
--- a/llvm/include/llvm/SandboxIR/Tracker.h
+++ b/llvm/include/llvm/SandboxIR/Tracker.h
@@ -101,6 +101,27 @@ public:
 #endif
 };
 
+/// Change record for a swap between two Uses that belong to the same User.
+class UseSwap : public IRChangeBase {
+  Use ThisUse, OtherUse; // The two uses that were swapped with each other.
+
+public:
+  UseSwap(const Use &ThisUse, const Use &OtherUse, Tracker &Tracker)
+      : IRChangeBase(Tracker), ThisUse(ThisUse), OtherUse(OtherUse) {
+    assert(ThisUse.getUser() == OtherUse.getUser() && "Expected same user!");
+  }
+  /// Reverting swaps the same pair of uses once more.
+  void revert() final { ThisUse.swap(OtherUse); }
+  void accept() final {}
+#ifndef NDEBUG
+  void dump(raw_ostream &OS) const final {
+    dumpCommon(OS);
+    OS << "UseSwap";
+  }
+  LLVM_DUMP_METHOD void dump() const final;
+#endif
+};
+
 class EraseFromParent : public IRChangeBase {
   /// Contains all the data we need to restore an "erased" (i.e., detached)
   /// instruction: the instruction itself and its operands in order.
diff --git a/llvm/include/llvm/SandboxIR/Use.h b/llvm/include/llvm/SandboxIR/Use.h
index d77b456..03cbfe6 100644
--- a/llvm/include/llvm/SandboxIR/Use.h
+++ b/llvm/include/llvm/SandboxIR/Use.h
@@ -47,6 +47,7 @@ public:
   void set(Value *V);
   class User *getUser() const { return Usr; }
   unsigned getOperandNo() const;
+  void swap(Use &OtherUse);
   Context *getContext() const { return Ctx; }
   bool operator==(const Use &Other) const {
     assert(Ctx == Other.Ctx && "Contexts differ!");
diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp
index 51c9af8..ceadb34 100644
--- a/llvm/lib/SandboxIR/SandboxIR.cpp
+++ b/llvm/lib/SandboxIR/SandboxIR.cpp
@@ -20,6 +20,13 @@ void Use::set(Value *V) { LLVMUse->set(V->Val); }
 
 unsigned Use::getOperandNo() const { return Usr->getUseOperandNo(*this); }
 
+// Track the swap first (when tracking), then swap the underlying LLVM uses.
+void Use::swap(Use &OtherUse) {
+  if (auto &Trk = Ctx->getTracker(); Trk.isTracking())
+    Trk.track(std::make_unique<UseSwap>(*this, OtherUse, Trk));
+  LLVMUse->swap(*OtherUse.LLVMUse);
+}
+
 #ifndef NDEBUG
 void Use::dump(raw_ostream &OS) const {
   Value *Def = nullptr;
@@ -500,6 +507,85 @@ void SelectInst::dump() const {
 }
 #endif // NDEBUG
 
+// Unconditional branch to IfTrue, placed above InsertBefore's topmost LLVM IR.
+BranchInst *BranchInst::create(BasicBlock *IfTrue, Instruction *InsertBefore,
+                               Context &Ctx) {
+  auto &Builder = Ctx.getLLVMIRBuilder();
+  Builder.SetInsertPoint(InsertBefore->getTopmostLLVMInstruction());
+  return Ctx.createBranchInst(
+      Builder.CreateBr(cast<llvm::BasicBlock>(IfTrue->Val)));
+}
+
+// Unconditional branch to IfTrue, appended at the end of block InsertAtEnd.
+BranchInst *BranchInst::create(BasicBlock *IfTrue, BasicBlock *InsertAtEnd,
+                               Context &Ctx) {
+  auto &IRB = Ctx.getLLVMIRBuilder();
+  IRB.SetInsertPoint(cast<llvm::BasicBlock>(InsertAtEnd->Val));
+  auto *LLVMDest = cast<llvm::BasicBlock>(IfTrue->Val);
+  return Ctx.createBranchInst(IRB.CreateBr(LLVMDest));
+}
+
+// Conditional branch on Cond, placed above InsertBefore's topmost LLVM IR.
+BranchInst *BranchInst::create(BasicBlock *IfTrue, BasicBlock *IfFalse,
+                               Value *Cond, Instruction *InsertBefore,
+                               Context &Ctx) {
+  auto &Builder = Ctx.getLLVMIRBuilder();
+  Builder.SetInsertPoint(InsertBefore->getTopmostLLVMInstruction());
+  return Ctx.createBranchInst(
+      Builder.CreateCondBr(Cond->Val, cast<llvm::BasicBlock>(IfTrue->Val),
+                           cast<llvm::BasicBlock>(IfFalse->Val)));
+}
+
+// Conditional branch on Cond, appended at the end of block InsertAtEnd.
+BranchInst *BranchInst::create(BasicBlock *IfTrue, BasicBlock *IfFalse,
+                               Value *Cond, BasicBlock *InsertAtEnd,
+                               Context &Ctx) {
+  auto &IRB = Ctx.getLLVMIRBuilder();
+  IRB.SetInsertPoint(cast<llvm::BasicBlock>(InsertAtEnd->Val));
+  auto *LLVMTrue = cast<llvm::BasicBlock>(IfTrue->Val);
+  auto *LLVMFalse = cast<llvm::BasicBlock>(IfFalse->Val);
+  return Ctx.createBranchInst(IRB.CreateCondBr(Cond->Val, LLVMTrue, LLVMFalse));
+}
+
+bool BranchInst::classof(const Value *From) { // For isa/dyn_cast.
+  return From->getSubclassID() == ClassID::Br;
+}
+
+Value *BranchInst::getCondition() const { // Conditional branches only.
+  assert(isConditional() && "Cannot get condition of an uncond branch!");
+  return Ctx.getValue(cast<llvm::BranchInst>(Val)->getCondition());
+}
+
+BasicBlock *BranchInst::getSuccessor(unsigned SuccIdx) const {
+  assert(SuccIdx < getNumSuccessors() &&
+         "Successor # out of range for Branch!");
+  auto *LLVMSucc = cast<llvm::BranchInst>(Val)->getSuccessor(SuccIdx);
+  return cast_or_null<BasicBlock>(Ctx.getValue(LLVMSucc)); // Sandbox IR block.
+}
+
+void BranchInst::setSuccessor(unsigned Idx, BasicBlock *NewSucc) {
+  assert(Idx < getNumSuccessors() && "Out of bounds!");
+  setOperand(getNumOperands() - 1u - Idx, NewSucc); // Last operands, reversed.
+}
+
+BasicBlock *BranchInst::LLVMBBToSBBB::operator()(llvm::BasicBlock *BB) const {
+  return cast<BasicBlock>(Ctx.getValue(BB)); // Sandbox IR block for BB.
+}
+const BasicBlock *
+BranchInst::ConstLLVMBBToSBBB::operator()(const llvm::BasicBlock *BB) const {
+  return cast<BasicBlock>(Ctx.getValue(BB)); // Sandbox IR block for BB.
+}
+#ifndef NDEBUG
+void BranchInst::dump(raw_ostream &OS) const { // Built only without NDEBUG.
+  dumpCommonPrefix(OS);
+  dumpCommonSuffix(OS);
+}
+void BranchInst::dump() const {
+  dump(dbgs());   // Print this instruction to the debug stream...
+  dbgs() << "\n"; // ...followed by a newline.
+}
+#endif // NDEBUG
+
 LoadInst *LoadInst::create(Type *Ty, Value *Ptr, MaybeAlign Align,
                            Instruction *InsertBefore, Context &Ctx,
                            const Twine &Name) {
@@ -758,6 +844,11 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) {
     It->second = std::unique_ptr<SelectInst>(new SelectInst(LLVMSel, *this));
     return It->second.get();
   }
+  case llvm::Instruction::Br: {
+    auto *LLVMBr = cast<llvm::BranchInst>(LLVMV);
+    It->second = std::unique_ptr<BranchInst>(new BranchInst(LLVMBr, *this));
+    return It->second.get();
+  }
   case llvm::Instruction::Load: {
     auto *LLVMLd = cast<llvm::LoadInst>(LLVMV);
     It->second = std::unique_ptr<LoadInst>(new LoadInst(LLVMLd, *this));
@@ -796,6 +887,11 @@ SelectInst *Context::createSelectInst(llvm::SelectInst *SI) {
   return cast<SelectInst>(registerValue(std::move(NewPtr)));
 }
 
+BranchInst *Context::createBranchInst(llvm::BranchInst *BI) {
+  std::unique_ptr<BranchInst> Owned(new BranchInst(BI, *this)); // Private ctor.
+  return cast<BranchInst>(registerValue(std::move(Owned)));
+}
+
 LoadInst *Context::createLoadInst(llvm::LoadInst *LI) {
   auto NewPtr = std::unique_ptr<LoadInst>(new LoadInst(LI, *this));
   return cast<LoadInst>(registerValue(std::move(NewPtr)));
diff --git a/llvm/lib/SandboxIR/Tracker.cpp b/llvm/lib/SandboxIR/Tracker.cpp
index 626c9c2..c741776 100644
--- a/llvm/lib/SandboxIR/Tracker.cpp
+++ b/llvm/lib/SandboxIR/Tracker.cpp
@@ -35,6 +35,11 @@ void UseSet::dump() const {
   dump(dbgs());
   dbgs() << "\n";
 }
+
+void UseSwap::dump() const {
+  dump(dbgs());   // Print this change record to the debug stream...
+  dbgs() << "\n"; // ...followed by a newline.
+}
 #endif // NDEBUG
 
 Tracker::~Tracker() {
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index 3d37eb2..bb36ce7 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -78,7 +78,7 @@ def HasSignExt :
 
 def HasSIMD128 :
     Predicate<"Subtarget->hasSIMD128()">,
-    AssemblerPredicate<(all_of FeatureSIMD128), "simd128">;
+    AssemblerPredicate<(any_of FeatureSIMD128, FeatureRelaxedSIMD), "simd128">;
 
 def HasTailCall :
     Predicate<"Subtarget->hasTailCall()">,
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 66bd786..64da3dfd 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -242,9 +242,16 @@ public:
     // recursion.
     bool Recursive = false;
 
-    // The corresponding allocation or interior call.
+    // The corresponding allocation or interior call. This is the primary call
+    // for which we have created this node.
     CallInfo Call;
 
+    // List of other calls that can be treated the same as the primary call
+    // through cloning. I.e. located in the same function and have the same
+    // (possibly pruned) stack ids. They will be updated the same way as the
+    // primary call when assigning to function clones.
+    std::vector<CallInfo> MatchingCalls;
+
     // For alloc nodes this is a unique id assigned when constructed, and for
     // callsite stack nodes it is the original stack id when the node is
     // constructed from the memprof MIB metadata on the alloc nodes. Note that
@@ -457,6 +464,9 @@ protected:
   /// iteration.
   MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
 
+  /// Records the function each call is located in.
+  DenseMap<CallInfo, const FuncTy *> CallToFunc;
+
   /// Map from callsite node to the enclosing caller function.
   std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
 
@@ -474,7 +484,8 @@ private:
   /// StackIdToMatchingCalls map.
   void assignStackNodesPostOrder(
       ContextNode *Node, DenseSet<const ContextNode *> &Visited,
-      DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls);
+      DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
+      DenseMap<CallInfo, CallInfo> &CallToMatchingCall);
 
   /// Duplicates the given set of context ids, updating the provided
   /// map from each original id with the newly generated context ids,
@@ -1230,10 +1241,11 @@ static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
 
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
-    assignStackNodesPostOrder(ContextNode *Node,
-                              DenseSet<const ContextNode *> &Visited,
-                              DenseMap<uint64_t, std::vector<CallContextInfo>>
-                                  &StackIdToMatchingCalls) {
+    assignStackNodesPostOrder(
+        ContextNode *Node, DenseSet<const ContextNode *> &Visited,
+        DenseMap<uint64_t, std::vector<CallContextInfo>>
+            &StackIdToMatchingCalls,
+        DenseMap<CallInfo, CallInfo> &CallToMatchingCall) {
   auto Inserted = Visited.insert(Node);
   if (!Inserted.second)
     return;
@@ -1246,7 +1258,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
     // Skip any that have been removed during the recursion.
     if (!Edge)
       continue;
-    assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls);
+    assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
+                              CallToMatchingCall);
   }
 
   // If this node's stack id is in the map, update the graph to contain new
@@ -1289,8 +1302,19 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
     auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
     // Skip any for which we didn't assign any ids, these don't get a node in
     // the graph.
-    if (SavedContextIds.empty())
+    if (SavedContextIds.empty()) {
+      // If this call has a matching call (located in the same function and
+      // having the same stack ids), simply add it to the context node created
+      // for its matching call earlier. These can be treated the same through
+      // cloning and get updated at the same time.
+      if (!CallToMatchingCall.contains(Call))
+        continue;
+      auto MatchingCall = CallToMatchingCall[Call];
+      assert(NonAllocationCallToContextNodeMap.contains(MatchingCall));
+      NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
+          Call);
       continue;
+    }
 
     assert(LastId == Ids.back());
 
@@ -1422,6 +1446,10 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
   // there is more than one call with the same stack ids. Their (possibly newly
   // duplicated) context ids are saved in the StackIdToMatchingCalls map.
   DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
+  // Save a map from each call to any that are found to match it. I.e. located
+  // in the same function and have the same (possibly pruned) stack ids. We use
+  // this to avoid creating extra graph nodes as they can be treated the same.
+  DenseMap<CallInfo, CallInfo> CallToMatchingCall;
   for (auto &It : StackIdToMatchingCalls) {
     auto &Calls = It.getSecond();
     // Skip single calls with a single stack id. These don't need a new node.
@@ -1460,6 +1488,13 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
     DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
     assert(!LastNodeContextIds.empty());
 
+    // Map from function to the first call from the below list (with matching
+    // stack ids) found in that function. Note that calls from different
+    // functions can have the same stack ids because this is the list of stack
+    // ids that had (possibly pruned) nodes after building the graph from the
+    // allocation MIBs.
+    DenseMap<const FuncTy *, CallInfo> FuncToCallMap;
+
     for (unsigned I = 0; I < Calls.size(); I++) {
       auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
       assert(SavedContextIds.empty());
@@ -1533,6 +1568,18 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
           continue;
       }
 
+      const FuncTy *CallFunc = CallToFunc[Call];
+
+      // If the prior call had the same stack ids this map would not be empty.
+      // Check if we already have a call that "matches" because it is located
+      // in the same function.
+      if (FuncToCallMap.contains(CallFunc)) {
+        // Record the matching call found for this call, and skip it. We
+        // will subsequently combine it into the same node.
+        CallToMatchingCall[Call] = FuncToCallMap[CallFunc];
+        continue;
+      }
+
       // Check if the next set of stack ids is the same (since the Calls vector
       // of tuples is sorted by the stack ids we can just look at the next one).
       bool DuplicateContextIds = false;
@@ -1562,7 +1609,14 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
         set_subtract(LastNodeContextIds, StackSequenceContextIds);
         if (LastNodeContextIds.empty())
           break;
-      }
+        // No longer possibly in a sequence of calls with duplicate stack ids,
+        // clear the map.
+        FuncToCallMap.clear();
+      } else
+        // Record the call with its function, so we can locate it the next time
+        // we find a call from this function when processing the calls with the
+        // same stack ids.
+        FuncToCallMap[CallFunc] = Call;
     }
   }
 
@@ -1579,7 +1633,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
   // associated context ids over to the new nodes.
   DenseSet<const ContextNode *> Visited;
   for (auto &Entry : AllocationCallToContextNodeMap)
-    assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls);
+    assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
+                              CallToMatchingCall);
   if (VerifyCCG)
     check();
 }
@@ -1679,6 +1734,7 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
           continue;
         if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
           CallsWithMetadata.push_back(&I);
+          CallToFunc[&I] = &F;
           auto *AllocNode = addAllocNode(&I, &F);
           auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
           assert(CallsiteMD);
@@ -1700,8 +1756,10 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
           I.setMetadata(LLVMContext::MD_callsite, nullptr);
         }
         // For callsite metadata, add to list for this function for later use.
-        else if (I.getMetadata(LLVMContext::MD_callsite))
+        else if (I.getMetadata(LLVMContext::MD_callsite)) {
           CallsWithMetadata.push_back(&I);
+          CallToFunc[&I] = &F;
+        }
       }
     }
     if (!CallsWithMetadata.empty())
@@ -1756,8 +1814,10 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
           // correlate properly in applyImport in the backends.
           if (AN.MIBs.empty())
             continue;
-          CallsWithMetadata.push_back({&AN});
-          auto *AllocNode = addAllocNode({&AN}, FS);
+          IndexCall AllocCall(&AN);
+          CallsWithMetadata.push_back(AllocCall);
+          CallToFunc[AllocCall] = FS;
+          auto *AllocNode = addAllocNode(AllocCall, FS);
           // Pass an empty CallStack to the CallsiteContext (second)
           // parameter, since for ThinLTO we already collapsed out the inlined
           // stack ids on the allocation call during ModuleSummaryAnalysis.
@@ -1788,8 +1848,11 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
       }
       // For callsite metadata, add to list for this function for later use.
       if (!FS->callsites().empty())
-        for (auto &SN : FS->mutableCallsites())
-          CallsWithMetadata.push_back({&SN});
+        for (auto &SN : FS->mutableCallsites()) {
+          IndexCall StackNodeCall(&SN);
+          CallsWithMetadata.push_back(StackNodeCall);
+          CallToFunc[StackNodeCall] = FS;
+        }
 
       if (!CallsWithMetadata.empty())
         FuncToCallsWithMetadata[FS] = CallsWithMetadata;
@@ -2225,6 +2288,14 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
   if (Recursive)
     OS << " (recursive)";
   OS << "\n";
+  if (!MatchingCalls.empty()) {
+    OS << "\tMatchingCalls:\n";
+    for (auto &MatchingCall : MatchingCalls) {
+      OS << "\t";
+      MatchingCall.print(OS);
+      OS << "\n";
+    }
+  }
   OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
   OS << "\tContextIds:";
   // Make a copy of the computed context ids that we can sort for stability.
@@ -2478,6 +2549,7 @@ CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
       std::make_unique<ContextNode>(Node->IsAllocation, Node->Call));
   ContextNode *Clone = NodeOwner.back().get();
   Node->addClone(Clone);
+  Clone->MatchingCalls = Node->MatchingCalls;
   assert(NodeToCallingFunc.count(Node));
   NodeToCallingFunc[Clone] = NodeToCallingFunc[Node];
   moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true,
@@ -3021,6 +3093,14 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
         if (CallMap.count(Call))
           CallClone = CallMap[Call];
         CallsiteClone->setCall(CallClone);
+        // Need to do the same for all matching calls.
+        for (auto &MatchingCall : Node->MatchingCalls) {
+          CallInfo CallClone(MatchingCall);
+          if (CallMap.count(MatchingCall))
+            CallClone = CallMap[MatchingCall];
+          // Updates the call in the list.
+          MatchingCall = CallClone;
+        }
       };
 
       // Keep track of the clones of callsite Node that need to be assigned to
@@ -3187,6 +3267,16 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
               CallInfo NewCall(CallMap[OrigCall]);
               assert(NewCall);
               NewClone->setCall(NewCall);
+              // Need to do the same for all matching calls.
+              for (auto &MatchingCall : NewClone->MatchingCalls) {
+                CallInfo OrigMatchingCall(MatchingCall);
+                OrigMatchingCall.setCloneNo(0);
+                assert(CallMap.count(OrigMatchingCall));
+                CallInfo NewCall(CallMap[OrigMatchingCall]);
+                assert(NewCall);
+                // Updates the call in the list.
+                MatchingCall = NewCall;
+              }
             }
           }
           // Fall through to handling below to perform the recording of the
@@ -3373,6 +3463,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
 
     if (Node->IsAllocation) {
       updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
+      assert(Node->MatchingCalls.empty());
       return;
     }
 
@@ -3381,6 +3472,9 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
 
     auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
     updateCall(Node->Call, CalleeFunc);
+    // Update all the matching calls as well.
+    for (auto &Call : Node->MatchingCalls)
+      updateCall(Call, CalleeFunc);
   };
 
   // Performs DFS traversal starting from allocation nodes to update calls to
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1b787d0..2d6d67a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -283,12 +283,12 @@ static Instruction *getInstructionForCost(const VPRecipeBase *R) {
 }
 
 InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
-  if (auto *UI = getInstructionForCost(this))
-    if (Ctx.skipCostComputation(UI, VF.isVector()))
-      return 0;
+  auto *UI = getInstructionForCost(this);
+  if (UI && Ctx.skipCostComputation(UI, VF.isVector()))
+    return 0;
 
   InstructionCost RecipeCost = computeCost(VF, Ctx);
-  if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
+  if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 &&
       RecipeCost.isValid())
     RecipeCost = InstructionCost(ForceTargetInstructionCost);
 
diff --git a/llvm/test/CodeGen/WebAssembly/simd-asm-pred.ll b/llvm/test/CodeGen/WebAssembly/simd-asm-pred.ll
new file mode 100644
index 0000000..f022c3e
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-asm-pred.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -verify-machineinstrs -mattr=+relaxed-simd | FileCheck %s
+
+; Test that the "relaxed-simd" target feature also implies 'simd128' in the
+; AssemblerPredicate, which is used to verify instructions in AsmPrinter.
+
+target triple = "wasm32-unknown-unknown"
+
+declare <2 x i64> @llvm.wasm.relaxed.laneselect.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
+
+; The compiled result of this function uses LOCAL_GET_V128, which is predicated
+; on the 'simd128' feature. We should be able to compile this when only
+; 'relaxed-simd' is set, which implies 'simd128'.
+define <2 x i64> @test(<2 x i64>, <2 x i64>, <2 x i64>) #0 {
+; CHECK-LABEL: test:
+; CHECK:         .functype  test (v128, v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get  0
+; CHECK-NEXT:    local.get  1
+; CHECK-NEXT:    local.get  2
+; CHECK-NEXT:    i64x2.relaxed_laneselect
+start:
+  %_4 = tail call <2 x i64> @llvm.wasm.relaxed.laneselect.v2i64(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) #3
+  ret <2 x i64> %_4
+}
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_float.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_float.ll
new file mode 100644
index 0000000..54b5705
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_float.ll
@@ -0,0 +1,307 @@
+; No assertions yet because the test case crashes MSan
+;
+; Test memory sanitizer instrumentation for Arm NEON VST_{2,3,4} and
+; VST_1x{2,3,4} instructions, including floating-point parameters.
+;
+; RUN: opt < %s -passes=msan -S | FileCheck %s
+;
+; UNSUPPORTED: {{.*}}
+;
+; Generated with:
+;     grep call clang/test/CodeGen/aarch64-neon-intrinsics.c \
+;         |  grep 'neon[.]st'                                \
+;         | sed -r 's/^\/\/ CHECK:[ ]*//'                    \
+;         | cut -d ' ' -f 1 --complement                     \
+;         | sed -r 's/[[][[]TMP[0-9]+[]][]]/%A/'             \
+;         | sed -r 's/[[][[]TMP[0-9]+[]][]]/%B/'             \
+;         | sed -r 's/[[][[]TMP[0-9]+[]][]]/%C/'             \
+;         | sed -r 's/[[][[]TMP[0-9]+[]][]]/%D/'             \
+;         | sort                                             \
+;         | uniq                                             \
+;         | while read x;                                    \
+;             do                                             \
+;                 y=`echo "$x"                               \
+;                     | sed -r 's/@llvm[.]aarch64[.]neon[.]/@/' \
+;                     | sed -r 's/[.]p0//'                      \
+;                     | tr '.' '_'`;                            \
+;                 echo "define $y sanitize_memory {"; \
+;                 echo "  call $x";                   \
+;                 echo "  ret void";                  \
+;                 echo "}";                           \
+;                 echo;                               \
+;             done
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android9001"
+
+; -----------------------------------------------------------------------------------------------------------------------------------------------
+
+define void @st1x2_v1f64(<1 x double> %A, <1 x double> %B, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st1x2.v1f64.p0(<1 x double> %A, <1 x double> %B, ptr %a)
+  ret void
+}
+
+define void @st1x2_v1i64(<1 x i64> %A, <1 x i64> %B, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> %A, <1 x i64> %B, ptr %a)
+  ret void
+}
+
+define void @st1x2_v2f64(<2 x double> %A, <2 x double> %B, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st1x2.v2f64.p0(<2 x double> %A, <2 x double> %B, ptr %a)
+  ret void
+}
+
+define void @st1x2_v2i64(<2 x i64> %A, <2 x i64> %B, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> %A, <2 x i64> %B, ptr %a)
+  ret void
+}
+
+define void @st1x3_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st1x3.v1f64.p0(<1 x double> %A, <1 x double> %B, <1 x double> %C, ptr %a)
+  ret void
+}
+
+define void @st1x3_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st1x3.v1i64.p0(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %a)
+  ret void
+}
+
+define void @st1x3_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st1x3.v2f64.p0(<2 x double> %A, <2 x double> %B, <2 x double> %C, ptr %a)
+  ret void
+}
+
+define void @st1x3_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st1x3.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %a)
+  ret void
+}
+
+define void @st1x4_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st1x4.v1f64.p0(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, ptr %a)
+  ret void
+}
+
+define void @st1x4_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr %a)
+  ret void
+}
+
+define void @st1x4_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st1x4.v2f64.p0(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, ptr %a)
+  ret void
+}
+
+define void @st1x4_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %a)
+  ret void
+}
+
+define void @st2_v16i8(<16 x i8> %A, <16 x i8> %B, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> %A, <16 x i8> %B, ptr %a)
+  ret void
+}
+
+define void @st2_v1f64(<1 x double> %A, <1 x double> %B, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st2.v1f64.p0(<1 x double> %A, <1 x double> %B, ptr %a)
+  ret void
+}
+
+define void @st2_v1i64(<1 x i64> %A, <1 x i64> %B, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> %A, <1 x i64> %B, ptr %a)
+  ret void
+}
+
+define void @st2_v2f32(<2 x float> %A, <2 x float> %B, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st2.v2f32.p0(<2 x float> %A, <2 x float> %B, ptr %a)
+  ret void
+}
+
+define void @st2_v2f64(<2 x double> %A, <2 x double> %B, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st2.v2f64.p0(<2 x double> %A, <2 x double> %B, ptr %a)
+  ret void
+}
+
+define void @st2_v2i32(<2 x i32> %A, <2 x i32> %B, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> %A, <2 x i32> %B, ptr %a)
+  ret void
+}
+
+define void @st2_v2i64(<2 x i64> %A, <2 x i64> %B, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> %A, <2 x i64> %B, ptr %a)
+  ret void
+}
+
+define void @st2_v4f16(<4 x half> %A, <4 x half> %B, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st2.v4f16.p0(<4 x half> %A, <4 x half> %B, ptr %a)
+  ret void
+}
+
+define void @st2_v4f32(<4 x float> %A, <4 x float> %B, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> %A, <4 x float> %B, ptr %a)
+  ret void
+}
+
+define void @st2_v4i16(<4 x i16> %A, <4 x i16> %B, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> %A, <4 x i16> %B, ptr %a)
+  ret void
+}
+
+define void @st2_v4i32(<4 x i32> %A, <4 x i32> %B, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %A, <4 x i32> %B, ptr %a)
+  ret void
+}
+
+define void @st2_v8f16(<8 x half> %A, <8 x half> %B, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st2.v8f16.p0(<8 x half> %A, <8 x half> %B, ptr %a)
+  ret void
+}
+
+define void @st2_v8i16(<8 x i16> %A, <8 x i16> %B, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> %A, <8 x i16> %B, ptr %a)
+  ret void
+}
+
+define void @st2_v8i8(<8 x i8> %A, <8 x i8> %B, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> %A, <8 x i8> %B, ptr %a)
+  ret void
+}
+
+define void @st3_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %a)
+  ret void
+}
+
+define void @st3_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st3.v1f64.p0(<1 x double> %A, <1 x double> %B, <1 x double> %C, ptr %a)
+  ret void
+}
+
+define void @st3_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %a)
+  ret void
+}
+
+define void @st3_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st3.v2f32.p0(<2 x float> %A, <2 x float> %B, <2 x float> %C, ptr %a)
+  ret void
+}
+
+define void @st3_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st3.v2f64.p0(<2 x double> %A, <2 x double> %B, <2 x double> %C, ptr %a)
+  ret void
+}
+
+define void @st3_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, ptr %a)
+  ret void
+}
+
+define void @st3_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %a)
+  ret void
+}
+
+define void @st3_v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st3.v4f16.p0(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr %a)
+  ret void
+}
+
+define void @st3_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st3.v4f32.p0(<4 x float> %A, <4 x float> %B, <4 x float> %C, ptr %a)
+  ret void
+}
+
+define void @st3_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr %a)
+  ret void
+}
+
+define void @st3_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %a)
+  ret void
+}
+
+define void @st3_v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st3.v8f16.p0(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr %a)
+  ret void
+}
+
+define void @st3_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %a)
+  ret void
+}
+
+define void @st3_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %a)
+  ret void
+}
+
+define void @st4_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr %a)
+  ret void
+}
+
+define void @st4_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st4.v1f64.p0(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, ptr %a)
+  ret void
+}
+
+define void @st4_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr %a)
+  ret void
+}
+
+define void @st4_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x float> %D, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st4.v2f32.p0(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x float> %D, ptr %a)
+  ret void
+}
+
+define void @st4_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st4.v2f64.p0(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, ptr %a)
+  ret void
+}
+
+define void @st4_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, ptr %a)
+  ret void
+}
+
+define void @st4_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %a)
+  ret void
+}
+
+define void @st4_v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> %D, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st4.v4f16.p0(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> %D, ptr %a)
+  ret void
+}
+
+define void @st4_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x float> %D, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st4.v4f32.p0(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x float> %D, ptr %a)
+  ret void
+}
+
+define void @st4_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, ptr %a)
+  ret void
+}
+
+define void @st4_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, ptr %a)
+  ret void
+}
+
+define void @st4_v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> %D, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st4.v8f16.p0(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> %D, ptr %a)
+  ret void
+}
+
+define void @st4_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, ptr %a)
+  ret void
+}
+
+define void @st4_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %a) sanitize_memory {
+  call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %a)
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/force-target-instruction-cost.ll
new file mode 100644
index 0000000..3477c8d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/force-target-instruction-cost.ll
@@ -0,0 +1,73 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -mcpu=z16 -force-target-instruction-cost=1 -S %s | FileCheck %s
+
+target triple = "systemz-unknown-linux-unknown"
+
+define void @test_scalar_steps_target_instruction_cost(ptr %dst) {
+; CHECK-LABEL: define void @test_scalar_steps_target_instruction_cost(
+; CHECK-SAME: ptr [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE2:.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 3
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[INDEX]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[VEC_IV:%.*]] = add <2 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1>
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ule <2 x i64> [[VEC_IV]], <i64 8, i64 8>
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK:       [[PRED_STORE_IF]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP2]]
+; CHECK-NEXT:    store i64 [[TMP2]], ptr [[TMP3]], align 8
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; CHECK:       [[PRED_STORE_CONTINUE]]:
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2]]
+; CHECK:       [[PRED_STORE_IF1]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP5]]
+; CHECK-NEXT:    store i64 [[TMP5]], ptr [[TMP6]], align 8
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
+; CHECK:       [[PRED_STORE_CONTINUE2]]:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 30, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i64 [[IV]], ptr [[GEP]], align 8
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 3
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IV]], 22
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep = getelementptr inbounds i64, ptr %dst, i64 %iv
+  store i64 %iv, ptr %gep, align 8
+  %iv.next = add nuw nsw i64 %iv, 3
+  %cmp = icmp ult i64 %iv, 22
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
index ba90b4f..c600103 100644
--- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp
+++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
@@ -398,7 +398,7 @@ bb1:
     EXPECT_EQ(Buff, R"IR(
 void @foo(i32 %arg0, i32 %arg1) {
 bb0:
-  br label %bb1 ; SB3. (Opaque)
+  br label %bb1 ; SB3. (Br)
 
 bb1:
   ret void ; SB5. (Ret)
@@ -466,7 +466,7 @@ bb1:
     BB0.dump(BS);
     EXPECT_EQ(Buff, R"IR(
 bb0:
-  br label %bb1 ; SB2. (Opaque)
+  br label %bb1 ; SB2. (Br)
 )IR");
   }
 #endif // NDEBUG
@@ -629,6 +629,111 @@ define void @foo(i1 %c0, i8 %v0, i8 %v1, i1 %c1) {
   }
 }
 
+TEST_F(SandboxIRTest, BranchInst) {
+  parseIR(C, R"IR(
+define void @foo(i1 %cond0, i1 %cond2) {
+ bb0:
+   br i1 %cond0, label %bb1, label %bb2
+ bb1:
+   ret void
+ bb2:
+   ret void
+}
+)IR");
+  llvm::Function *LLVMF = &*M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+  sandboxir::Function *F = Ctx.createFunction(LLVMF);
+  auto *Cond0 = F->getArg(0);
+  auto *Cond1 = F->getArg(1);
+  auto *BB0 = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(*LLVMF, "bb0")));
+  auto *BB1 = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(*LLVMF, "bb1")));
+  auto *Ret1 = BB1->getTerminator();
+  auto *BB2 = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(*LLVMF, "bb2")));
+  auto *Ret2 = BB2->getTerminator();
+  auto It = BB0->begin();
+  auto *Br0 = cast<sandboxir::BranchInst>(&*It++);
+  // Check isUnconditional().
+  EXPECT_FALSE(Br0->isUnconditional());
+  // Check isConditional().
+  EXPECT_TRUE(Br0->isConditional());
+  // Check getCondition().
+  EXPECT_EQ(Br0->getCondition(), Cond0);
+  // Check setCondition().
+  Br0->setCondition(Cond1);
+  EXPECT_EQ(Br0->getCondition(), Cond1);
+  // Check getNumSuccessors().
+  EXPECT_EQ(Br0->getNumSuccessors(), 2u);
+  // Check getSuccessor().
+  EXPECT_EQ(Br0->getSuccessor(0), BB1);
+  EXPECT_EQ(Br0->getSuccessor(1), BB2);
+  // Check swapSuccessors().
+  Br0->swapSuccessors();
+  EXPECT_EQ(Br0->getSuccessor(0), BB2);
+  EXPECT_EQ(Br0->getSuccessor(1), BB1);
+  // Check successors().
+  EXPECT_EQ(range_size(Br0->successors()), 2u);
+  unsigned SuccIdx = 0;
+  SmallVector<sandboxir::BasicBlock *> ExpectedSuccs({BB1, BB2});
+  for (sandboxir::BasicBlock *Succ : Br0->successors())
+    EXPECT_EQ(Succ, ExpectedSuccs[SuccIdx++]);
+
+  {
+    // Check unconditional BranchInst::create() InsertBefore.
+    auto *Br = sandboxir::BranchInst::create(BB1, /*InsertBefore=*/Ret1, Ctx);
+    EXPECT_FALSE(Br->isConditional());
+    EXPECT_TRUE(Br->isUnconditional());
+#ifndef NDEBUG
+    EXPECT_DEATH(Br->getCondition(), ".*condition.*");
+#endif // NDEBUG
+    unsigned SuccIdx = 0;
+    SmallVector<sandboxir::BasicBlock *> ExpectedSuccs({BB1});
+    for (sandboxir::BasicBlock *Succ : Br->successors())
+      EXPECT_EQ(Succ, ExpectedSuccs[SuccIdx++]);
+    EXPECT_EQ(Br->getNextNode(), Ret1);
+  }
+  {
+    // Check unconditional BranchInst::create() InsertAtEnd.
+    auto *Br = sandboxir::BranchInst::create(BB1, /*InsertAtEnd=*/BB1, Ctx);
+    EXPECT_FALSE(Br->isConditional());
+    EXPECT_TRUE(Br->isUnconditional());
+#ifndef NDEBUG
+    EXPECT_DEATH(Br->getCondition(), ".*condition.*");
+#endif // NDEBUG
+    unsigned SuccIdx = 0;
+    SmallVector<sandboxir::BasicBlock *> ExpectedSuccs({BB1});
+    for (sandboxir::BasicBlock *Succ : Br->successors())
+      EXPECT_EQ(Succ, ExpectedSuccs[SuccIdx++]);
+    EXPECT_EQ(Br->getPrevNode(), Ret1);
+  }
+  {
+    // Check conditional BranchInst::create() InsertBefore.
+    auto *Br = sandboxir::BranchInst::create(BB1, BB2, Cond0,
+                                             /*InsertBefore=*/Ret1, Ctx);
+    EXPECT_TRUE(Br->isConditional());
+    EXPECT_EQ(Br->getCondition(), Cond0);
+    unsigned SuccIdx = 0;
+    SmallVector<sandboxir::BasicBlock *> ExpectedSuccs({BB2, BB1});
+    for (sandboxir::BasicBlock *Succ : Br->successors())
+      EXPECT_EQ(Succ, ExpectedSuccs[SuccIdx++]);
+    EXPECT_EQ(Br->getNextNode(), Ret1);
+  }
+  {
+    // Check conditional BranchInst::create() InsertAtEnd.
+    auto *Br = sandboxir::BranchInst::create(BB1, BB2, Cond0,
+                                             /*InsertAtEnd=*/BB2, Ctx);
+    EXPECT_TRUE(Br->isConditional());
+    EXPECT_EQ(Br->getCondition(), Cond0);
+    unsigned SuccIdx = 0;
+    SmallVector<sandboxir::BasicBlock *> ExpectedSuccs({BB2, BB1});
+    for (sandboxir::BasicBlock *Succ : Br->successors())
+      EXPECT_EQ(Succ, ExpectedSuccs[SuccIdx++]);
+    EXPECT_EQ(Br->getPrevNode(), Ret2);
+  }
+}
+
 TEST_F(SandboxIRTest, LoadInst) {
   parseIR(C, R"IR(
 define void @foo(ptr %arg0, ptr %arg1) {
diff --git a/llvm/unittests/SandboxIR/TrackerTest.cpp b/llvm/unittests/SandboxIR/TrackerTest.cpp
index 354cd18..dd9dcd5 100644
--- a/llvm/unittests/SandboxIR/TrackerTest.cpp
+++ b/llvm/unittests/SandboxIR/TrackerTest.cpp
@@ -69,6 +69,49 @@ define void @foo(ptr %ptr) {
   EXPECT_EQ(Ld->getOperand(0), Gep0);
 }
 
+TEST_F(TrackerTest, SwapOperands) {
+  parseIR(C, R"IR(
+define void @foo(i1 %cond) {
+ bb0:
+   br i1 %cond, label %bb1, label %bb2
+ bb1:
+   ret void
+ bb2:
+   ret void
+}
+)IR");
+  Function &LLVMF = *M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+  Ctx.createFunction(&LLVMF);
+  auto *BB0 = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(LLVMF, "bb0")));
+  auto *BB1 = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(LLVMF, "bb1")));
+  auto *BB2 = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(LLVMF, "bb2")));
+  auto &Tracker = Ctx.getTracker();
+  Tracker.save();
+  auto It = BB0->begin();
+  auto *Br = cast<sandboxir::BranchInst>(&*It++);
+
+  unsigned SuccIdx = 0;
+  SmallVector<sandboxir::BasicBlock *> ExpectedSuccs({BB2, BB1});
+  for (auto *Succ : Br->successors())
+    EXPECT_EQ(Succ, ExpectedSuccs[SuccIdx++]);
+
+  // This calls User::swapOperandsInternal() internally.
+  Br->swapSuccessors();
+
+  SuccIdx = 0;
+  for (auto *Succ : reverse(Br->successors()))
+    EXPECT_EQ(Succ, ExpectedSuccs[SuccIdx++]);
+
+  Ctx.getTracker().revert();
+  SuccIdx = 0;
+  for (auto *Succ : Br->successors())
+    EXPECT_EQ(Succ, ExpectedSuccs[SuccIdx++]);
+}
+
 TEST_F(TrackerTest, RUWIf_RAUW_RUOW) {
   parseIR(C, R"IR(
 define void @foo(ptr %ptr) {
diff --git a/llvm/utils/mlgo-utils/mlgo/__init__.py b/llvm/utils/mlgo-utils/mlgo/__init__.py
index c5b208c..d3369ab 100644
--- a/llvm/utils/mlgo-utils/mlgo/__init__.py
+++ b/llvm/utils/mlgo-utils/mlgo/__init__.py
@@ -4,7 +4,7 @@
 
 from datetime import timezone, datetime
 
-__versioninfo__ = (19, 0, 0)
+__versioninfo__ = (20, 0, 0)
 __version__ = (
     ".".join(str(v) for v in __versioninfo__)
     + "dev"
diff --git a/llvm/utils/release/bump-version.py b/llvm/utils/release/bump-version.py
index b1799cb..5db62e8 100755
--- a/llvm/utils/release/bump-version.py
+++ b/llvm/utils/release/bump-version.py
@@ -188,6 +188,11 @@ if __name__ == "__main__":
             "llvm/utils/lit/lit/__init__.py",
             LitProcessor(args),
         ),
+        # mlgo-utils configuration
+        (
+            "llvm/utils/mlgo-utils/mlgo/__init__.py",
+            LitProcessor(args),
+        ),
         # GN build system
         (
             "llvm/utils/gn/secondary/llvm/version.gni",